Shreyas094 committed on
Commit 3817f14
1 Parent(s): c17888a

Update app.py

Files changed (1):
  1. app.py +58 -10
app.py CHANGED
@@ -16,6 +16,7 @@ from datetime import datetime
 import os
 from dotenv import load_dotenv
 import certifi
+import random
 
 # Load environment variables from a .env file
 load_dotenv()
@@ -66,12 +67,34 @@ def is_valid_url(url):
     except ValueError:
         return False
 
+class ScrapingError(Exception):
+    def __init__(self, message, status_code=None):
+        self.message = message
+        self.status_code = status_code
+        super().__init__(self.message)
+
+def get_random_user_agent(include_searx=False):
+    user_agents = [
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
+        # Add more user agents...
+    ]
+
+    searx_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+
+    if include_searx:
+        return searx_agent
+    else:
+        return random.choice(user_agents)
+
+@retry(stop=stop_after_attempt(1), wait=wait_exponential(multiplier=1, min=4, max=10))
 def scrape_with_bs4(url, session):
     try:
-        response = session.get(url, timeout=10)
+        headers = {'User-Agent': get_random_user_agent()}
+        response = session.get(url, timeout=15, headers=headers)
         response.raise_for_status()
-        soup = BeautifulSoup(response.content, 'html.parser')
 
+        soup = BeautifulSoup(response.content, 'html.parser')
         main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
 
         if main_content:
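A quick sketch of what the rotation does in practice (the URLs below are illustrative; get_random_user_agent is the helper added above):

import requests

session = requests.Session()
# Each call draws a fresh User-Agent from the pool, so consecutive
# fetches from the same session present varying browser signatures.
for url in ('https://example.com/a', 'https://example.com/b'):
    headers = {'User-Agent': get_random_user_agent()}
    response = session.get(url, timeout=15, headers=headers)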
@@ -79,19 +102,39 @@ def scrape_with_bs4(url, session):
         else:
             content = soup.get_text(strip=True)
 
-        return content
+        return {'success': True, 'content': content}
+    except requests.exceptions.HTTPError as e:
+        if e.response.status_code == 403:
+            logger.warning(f"403 Forbidden error for {url}. Retrying with backoff.")
+            raise ScrapingError("403 Forbidden", status_code=403)
+        logger.error(f"HTTP error scraping {url}: {e}")
+        return {'success': False, 'error': str(e), 'status_code': e.response.status_code}
+    except requests.exceptions.Timeout:
+        logger.error(f"Timeout error scraping {url}")
+        return {'success': False, 'error': 'Timeout'}
+    except requests.exceptions.ConnectionError:
+        logger.error(f"Connection error scraping {url}")
+        return {'success': False, 'error': 'Connection Error'}
     except Exception as e:
-        logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
-        return ""
+        logger.error(f"Unexpected error scraping {url}: {e}")
+        return {'success': False, 'error': str(e)}
 
+@retry(stop=stop_after_attempt(1), wait=wait_exponential(multiplier=1, min=4, max=10))
 def scrape_with_trafilatura(url):
     try:
-        downloaded = fetch_url(url)
+        downloaded = fetch_url(url, timeout=10)
+        if downloaded is None:
+            raise ScrapingError("Failed to download content")
         content = extract(downloaded)
-        return content or ""
+        if content is None:
+            raise ScrapingError("Failed to extract content")
+        return {'success': True, 'content': content}
+    except ScrapingError as e:
+        logger.error(f"Scraping error for {url}: {e}")
+        return {'success': False, 'error': str(e)}
     except Exception as e:
-        logger.error(f"Error scraping {url} with Trafilatura: {e}")
-        return ""
+        logger.error(f"Unexpected error scraping {url} with Trafilatura: {e}")
+        return {'success': False, 'error': str(e)}
 
 def rephrase_query(chat_history, query, temperature=0.2):
     system_prompt = """You are a highly intelligent conversational chatbot. Your task is to analyze the given context and new query, then decide whether to rephrase the query with or without incorporating the context. Follow these steps:
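Both scrapers now return a result dict instead of a bare string, so callers can tell an empty page apart from a failure. One caveat worth noting: stop_after_attempt(1) gives tenacity exactly one attempt, so the ScrapingError raised on a 403 surfaces as a tenacity RetryError rather than triggering the exponential backoff. A minimal caller sketch that handles both shapes (safe_scrape is hypothetical, not part of the commit):

import requests
from tenacity import RetryError

def safe_scrape(url, session):
    # Soft failures come back as {'success': False, ...}; the 403 path
    # re-raises ScrapingError, which tenacity wraps in RetryError once
    # the single attempt is spent.
    try:
        return scrape_with_bs4(url, session)
    except RetryError as e:
        return {'success': False, 'error': str(e.last_attempt.exception())}

result = safe_scrape('https://example.com/report', requests.Session())
content = result['content'] if result['success'] else ''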
@@ -252,6 +295,11 @@ def scrape_full_content(url, scraper="trafilatura", max_chars=3000):
         logger.error(f"Error scraping full content from {url}: {e}")
         return ""
 
+
+def rate_limited_scraping(url, scraper_func, *args, **kwargs):
+    time.sleep(random.uniform(1, 3))  # Random delay between 1-3 seconds
+    return scraper_func(url, *args, **kwargs)
+
 def llm_summarize(query, documents, llm_client, temperature=0.2):
     system_prompt = """You are Sentinel, a world class Financial analysis AI model who is expert at searching the web and answering user's queries. You are also an expert at summarizing web pages or documents and searching for content in them."""
 
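A usage sketch for the new throttle (urls_to_fetch and process are placeholders; time is assumed to be imported elsewhere in app.py, since this commit only adds random):

# Pace a batch of fetches: each call sleeps 1-3 s before delegating
# to the scraper, so requests are spread out rather than fired in a burst.
for url in urls_to_fetch:
    result = rate_limited_scraping(url, scrape_with_trafilatura)
    if result['success']:
        process(result['content'])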
@@ -329,7 +377,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
 
     # Headers for SearXNG request
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'User-Agent': get_random_user_agent(include_searx=True),
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'en-US,en;q=0.5',
        'Origin': 'https://shreyas094-searxng-local.hf.space',
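The include_searx flag encodes a deliberate split: requests to the author's own SearXNG instance always present the one pinned Chrome User-Agent, while third-party page fetches rotate per call. Illustratively:

searx_headers = {'User-Agent': get_random_user_agent(include_searx=True)}  # always the pinned agent
page_headers = {'User-Agent': get_random_user_agent()}  # random pick per call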