Shreyas094 committed
Commit 1e878de
Parent: 128980b

Update app.py

Files changed (1)
  1. app.py +61 -209
app.py CHANGED
@@ -5,10 +5,7 @@ import logging
 from urllib.parse import urlparse
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
-from trafilatura import fetch_url, extract
-from trafilatura import extract
 from requests.exceptions import Timeout
-from trafilatura.settings import use_config
 from urllib.request import urlopen, Request
 import json
 from huggingface_hub import InferenceClient
@@ -21,16 +18,10 @@ import os
 from dotenv import load_dotenv
 import certifi
 import requests
-import scrapy
-from scrapy.crawler import CrawlerProcess
-from scrapy import signals
-from scrapy.signalmanager import dispatcher
-from scrapy.utils.log import configure_logging
 from newspaper import Article
 import PyPDF2
 import io
 import requests
-from duckduckgo_search import DDGS
 import random
 import datetime
 from groq import Groq
@@ -60,49 +51,12 @@ client = InferenceClient(
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")

 # Initialize Groq client
-groq_client = Groq(api_key=GROQ_API_KEY)
+groq_client = Groq(api_key=GROQ_API_KEY)

 # Initialize the similarity model
 similarity_model = SentenceTransformer('all-MiniLM-L6-v2')


-def duckduckgo_search(query, num_results=10, time_range="", language="", safesearch=2):
-    try:
-        ddgs = DDGS()
-
-        # Convert time_range to DuckDuckGo format
-        if time_range == "day":
-            timelimit = "d"
-        elif time_range == "week":
-            timelimit = "w"
-        elif time_range == "month":
-            timelimit = "m"
-        elif time_range == "year":
-            timelimit = "y"
-        else:
-            timelimit = None
-
-        # Convert safesearch to DuckDuckGo format
-        if safesearch == 0:
-            safesearch_setting = "off"
-        elif safesearch == 1:
-            safesearch_setting = "moderate"
-        else:
-            safesearch_setting = "strict"
-
-        results = ddgs.text(
-            query,
-            region='wt-wt',
-            safesearch=safesearch_setting,
-            timelimit=timelimit,
-            max_results=num_results
-        )
-
-        return [{"url": result["href"], "title": result["title"]} for result in results]
-    except Exception as e:
-        logger.error(f"Error in DuckDuckGo search: {e}")
-        return []
-
 # Set up a session with retry mechanism
 def requests_retry_session(
     retries=0,
@@ -155,45 +109,6 @@ def scrape_pdf_content(url, max_chars=3000, timeout=5):
         logger.error(f"Error scraping PDF content from {url}: {e}")
         return ""

-
-class NewsSpider(scrapy.Spider):
-    name = 'news_spider'
-
-    def __init__(self, url=None, *args, **kwargs):
-        super(NewsSpider, self).__init__(*args, **kwargs)
-        self.start_urls = [url] if url else []
-
-    def parse(self, response):
-        content = ' '.join(response.css('p::text').getall())
-        self.logger.info(f"Scraped content length: {len(content)}")
-        return {'content': content}
-
-def scrape_with_scrapy(url, timeout=30):
-    logger.info(f"Starting to scrape with Scrapy: {url}")
-    configure_logging(install_root_handler=False)
-    logging.getLogger('scrapy').setLevel(logging.WARNING)
-
-    results = []
-
-    def spider_results(signal, sender, item, response, spider):
-        results.append(item)
-
-    process = CrawlerProcess(settings={
-        'LOG_ENABLED': True,
-        'LOG_LEVEL': 'WARNING',
-        'DOWNLOAD_TIMEOUT': timeout
-    })
-
-    dispatcher.connect(spider_results, signal=signals.item_scraped)
-
-    process.crawl(NewsSpider, url=url)
-    process.start()
-
-    # Get the content from results
-    if results:
-        return results[0]['content']
-    return ''
-
 def scrape_with_newspaper(url):
     if url.lower().endswith('.pdf'):
         return scrape_pdf_content(url)
@@ -225,55 +140,6 @@ def scrape_with_newspaper(url):
         logger.error(f"Error scraping {url} with Newspaper3k: {e}")
         return ""

-def scrape_with_bs4(url, session, max_chars=None):
-    try:
-        response = session.get(url, timeout=5)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
-
-        if main_content:
-            content = main_content.get_text(strip=True, separator='\n')
-        else:
-            content = soup.get_text(strip=True, separator='\n')
-
-        return content[:max_chars] if max_chars else content
-    except Exception as e:
-        logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
-        return ""
-
-def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
-    try:
-        response = requests.get(url, timeout=timeout)
-        response.raise_for_status()
-        downloaded = response.text
-        content = ""
-
-        if use_beautifulsoup:
-            soup = BeautifulSoup(downloaded, "lxml")
-            # Convert BeautifulSoup object to a string
-            html_string = str(soup)
-            # Use Trafilatura's extract function directly on the HTML string
-            content = extract(html_string, include_comments=False, include_tables=True, no_fallback=False)
-
-        # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
-        if not content and use_beautifulsoup:
-            logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
-            content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
-
-        # If still no content, use the URL directly
-        if not content:
-            content = extract(url, include_comments=False, include_tables=True, no_fallback=False)
-
-        return (content or "")[:max_chars] if max_chars else (content or "")
-    except requests.Timeout:
-        logger.error(f"Timeout error while scraping {url} with Trafilatura")
-        return ""
-    except Exception as e:
-        logger.error(f"Error scraping {url} with Trafilatura: {e}")
-        return ""
-
 def rephrase_query(chat_history, query, temperature=0.2):
     system_prompt = f"""
     You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:
@@ -552,8 +418,8 @@ Your response should be detailed, informative, accurate, and directly relevant t
         logger.error(f"Error in LLM summarization: {e}")
         return "Error: Unable to generate a summary. Please try again."

-def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_chars=3000, time_range="", language="all", category="",
-                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, use_duckduckgo=False, model="huggingface"):
+def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
+                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
     try:
         # Step 1: Rephrase the Query
         rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
@@ -564,78 +430,73 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
             return "No search needed for the provided input."

         # Step 2: Perform search
-        if use_duckduckgo:
-            search_results = duckduckgo_search(rephrased_query, num_results, time_range, language, safesearch)
-            results = search_results  # Assign DuckDuckGo results directly
-        else:
-            # Search query parameters
-            params = {
-                'q': rephrased_query,
-                'format': 'json',
-                'time_range': time_range,
-                'language': language,
-                'category': category,
-                'engines': ','.join(engines),
-                'safesearch': safesearch
-            }
-
-            # Remove empty parameters
-            params = {k: v for k, v in params.items() if v != ""}
+        # Search query parameters
+        params = {
+            'q': rephrased_query,
+            'format': 'json',
+            'time_range': time_range,
+            'language': language,
+            'category': category,
+            'engines': ','.join(engines),
+            'safesearch': safesearch
+        }

-            # If no engines are specified, set default engines
-            if 'engines' not in params:
-                params['engines'] = 'google'  # Default to 'google' or any preferred engine
-                logger.info("No engines specified. Defaulting to 'google'.")
-
-            # Headers for SearXNG request
-            headers = {
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-                'Accept': 'application/json, text/javascript, */*; q=0.01',
-                'Accept-Language': 'en-US,en;q=0.5',
-                'Origin': 'https://shreyas094-searxng-local.hf.space',
-                'Referer': 'https://shreyas094-searxng-local.hf.space/',
-                'DNT': '1',
-                'Connection': 'keep-alive',
-                'Sec-Fetch-Dest': 'empty',
-                'Sec-Fetch-Mode': 'cors',
-                'Sec-Fetch-Site': 'same-origin',
-            }
+        # Remove empty parameters
+        params = {k: v for k, v in params.items() if v != ""}
+
+        # If no engines are specified, set default engines
+        if 'engines' not in params:
+            params['engines'] = 'google'  # Default to 'google' or any preferred engine
+            logger.info("No engines specified. Defaulting to 'google'.")
+
+        # Headers for SearXNG request
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'application/json, text/javascript, */*; q=0.01',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Origin': 'https://shreyas094-searxng-local.hf.space',
+            'Referer': 'https://shreyas094-searxng-local.hf.space/',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Sec-Fetch-Dest': 'empty',
+            'Sec-Fetch-Mode': 'cors',
+            'Sec-Fetch-Site': 'same-origin',
+        }

         scraped_content = []
         page = 1
         while len(scraped_content) < num_results:
-            if not use_duckduckgo:
-                # Update params with current page
-                params['pageno'] = page
+            # Update params with current page
+            params['pageno'] = page

-                # Send request to SearXNG
-                logger.info(f"Sending request to SearXNG for query: {rephrased_query} (Page {page})")
-                session = requests_retry_session()
+            # Send request to SearXNG
+            logger.info(f"Sending request to SearXNG for query: {rephrased_query} (Page {page})")
+            session = requests_retry_session()

-                try:
-                    if method.upper() == "GET":
-                        response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
-                    else:  # POST
-                        response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
-
-                    response.raise_for_status()
-                except requests.exceptions.RequestException as e:
-                    logger.error(f"Error during SearXNG request: {e}")
-                    return f"An error occurred during the search request: {e}"
+            try:
+                if method.upper() == "GET":
+                    response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
+                else:  # POST
+                    response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
+
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Error during SearXNG request: {e}")
+                return f"An error occurred during the search request: {e}"

-                search_results = response.json()
-                logger.debug(f"SearXNG Response: {search_results}")
+            search_results = response.json()
+            logger.debug(f"SearXNG Response: {search_results}")

-                results = search_results.get('results', [])
-                if not results:
-                    logger.warning(f"No more results returned from SearXNG on page {page}.")
-                    break
+            results = search_results.get('results', [])
+            if not results:
+                logger.warning(f"No more results returned from SearXNG on page {page}.")
+                break

             for result in results:
                 if len(scraped_content) >= num_results:
                     break

-                url = result.get('url', '') if not use_duckduckgo else result.get('url', '')
+                url = result.get('url', '')
                 title = result.get('title', 'No title')

                 if not is_valid_url(url):
@@ -645,7 +506,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
                 try:
                     logger.info(f"Processing content from: {url}")

-                    content = scrape_full_content(url, scraper, max_chars, timeout)
+                    content = scrape_full_content(url, max_chars, timeout)

                     if not content:
                         logger.warning(f"Failed to scrape content from {url}")
@@ -655,7 +516,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
                         "title": title,
                         "url": url,
                         "content": content,
-                        "scraper": "pdf" if url.lower().endswith('.pdf') else scraper
+                        "scraper": "pdf" if url.lower().endswith('.pdf') else "newspaper"
                     })
                     logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
                 except requests.exceptions.RequestException as e:
@@ -663,10 +524,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
                 except Exception as e:
                     logger.error(f"Unexpected error while scraping {url}: {e}")

-            if use_duckduckgo:
-                break  # DuckDuckGo search doesn't support pagination in this implementation
-            else:
-                page += 1
+            page += 1

         if not scraped_content:
             logger.warning("No content scraped from search results.")
@@ -678,7 +536,6 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
         relevant_documents = []
         unique_summaries = []
         for doc in scraped_content:
-            # In the search_and_scrape function
             assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
             relevance, summary = assessment.split('\n', 1)

@@ -699,7 +556,6 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
         if not relevant_documents:
             logger.warning("No relevant and unique documents found.")
             return "No relevant and unique financial news found for the given query."
-        logger.debug(f"Assessment result: {assessment}")

         # Step 4: Rerank documents based on similarity to query
         reranked_docs = rerank_documents(rephrased_query, relevant_documents, similarity_threshold=0.95, max_results=num_results)
@@ -712,7 +568,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha

         # Step 5: Scrape full content for top documents (up to num_results)
         for doc in reranked_docs[:num_results]:
-            full_content = scrape_full_content(doc['url'], scraper, max_chars)
+            full_content = scrape_full_content(doc['url'], max_chars)
             doc['full_content'] = full_content

         # Prepare JSON for LLM
@@ -738,14 +594,13 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
         return f"An unexpected error occurred during the search and scrape process: {e}"


-def chat_function(message, history, num_results, scraper, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, use_duckduckgo, model):
+def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
    chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])

    response = search_and_scrape(
        query=message,
        chat_history=chat_history,
        num_results=num_results,
-        scraper=scraper,
        max_chars=max_chars,
        time_range=time_range,
        language=language,
@@ -754,7 +609,6 @@ def chat_function(message, history, num_results, scraper, max_chars, time_range,
        safesearch=safesearch,
        method=method,
        llm_temperature=llm_temperature,
-        use_duckduckgo=use_duckduckgo,
        model=model
    )

@@ -767,7 +621,6 @@ iface = gr.ChatInterface(
    theme=gr.Theme.from_hub("allenai/gradio-theme"),
    additional_inputs=[
        gr.Slider(5, 20, value=10, step=1, label="Number of initial results"),
-        gr.Dropdown(["bs4", "trafilatura", "scrapy", "newspaper"], value="newspaper", label="Scraping Method"),
        gr.Slider(500, 10000, value=1500, step=100, label="Max characters to retrieve"),
        gr.Dropdown(["", "day", "week", "month", "year"], value="", label="Time Range"),
        gr.Dropdown(["", "all", "en", "fr", "de", "es", "it", "nl", "pt", "pl", "ru", "zh"], value="", label="Language"),
@@ -781,7 +634,6 @@ iface = gr.ChatInterface(
        gr.Slider(0, 2, value=2, step=1, label="Safe Search Level"),
        gr.Radio(["GET", "POST"], value="POST", label="HTTP Method"),
        gr.Slider(0, 1, value=0.2, step=0.1, label="LLM Temperature"),
-        gr.Checkbox(label="Use DuckDuckGo Search", value=False),
        gr.Dropdown(["huggingface", "groq"], value="huggingface", label="LLM Model"),
    ],
    additional_inputs_accordion=gr.Accordion("⚙️ Advanced Parameters", open=True),
 