Shreyas094
committed on
Commit
•
84a4885
1
Parent(s):
ef24902
Update app.py
Browse files
app.py
CHANGED
@@ -16,8 +16,6 @@ from datetime import datetime
|
|
16 |
import os
|
17 |
from dotenv import load_dotenv
|
18 |
import certifi
|
19 |
-
import random
|
20 |
-
from tenacity import retry, stop_after_attempt, wait_exponential
|
21 |
|
22 |
# Load environment variables from a .env file
|
23 |
load_dotenv()
|
@@ -68,34 +66,12 @@ def is_valid_url(url):
|
|
68 |
except ValueError:
|
69 |
return False
|
70 |
|
71 |
-
class ScrapingError(Exception):
    """Raised when a page cannot be scraped; optionally carries the HTTP status.

    Attributes:
        message: human-readable failure description.
        status_code: HTTP status that triggered the failure, or None.
    """

    def __init__(self, message, status_code=None):
        super().__init__(message)
        self.message = message
        self.status_code = status_code
|
76 |
-
|
77 |
-
def get_random_user_agent(include_searx=False):
    """Return a User-Agent header value.

    Args:
        include_searx: when True, return the fixed Chrome UA used for
            SearXNG requests instead of a random one.

    Returns:
        A browser User-Agent string — fixed for SearXNG, otherwise drawn
        at random from the pool below.
    """
    searx_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    if include_searx:
        return searx_agent

    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
        # Add more user agents...
    ]
    return random.choice(user_agents)
|
90 |
-
|
91 |
-
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
92 |
def scrape_with_bs4(url, session):
|
93 |
try:
|
94 |
-
|
95 |
-
response = session.get(url, timeout=15, headers=headers)
|
96 |
response.raise_for_status()
|
97 |
-
|
98 |
soup = BeautifulSoup(response.content, 'html.parser')
|
|
|
99 |
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
|
100 |
|
101 |
if main_content:
|
@@ -103,39 +79,19 @@ def scrape_with_bs4(url, session):
|
|
103 |
else:
|
104 |
content = soup.get_text(strip=True)
|
105 |
|
106 |
-
return
|
107 |
-
except requests.exceptions.HTTPError as e:
|
108 |
-
if e.response.status_code == 403:
|
109 |
-
logger.warning(f"403 Forbidden error for {url}. Retrying with backoff.")
|
110 |
-
raise ScrapingError("403 Forbidden", status_code=403)
|
111 |
-
logger.error(f"HTTP error scraping {url}: {e}")
|
112 |
-
return {'success': False, 'error': str(e), 'status_code': e.response.status_code}
|
113 |
-
except requests.exceptions.Timeout:
|
114 |
-
logger.error(f"Timeout error scraping {url}")
|
115 |
-
return {'success': False, 'error': 'Timeout'}
|
116 |
-
except requests.exceptions.ConnectionError:
|
117 |
-
logger.error(f"Connection error scraping {url}")
|
118 |
-
return {'success': False, 'error': 'Connection Error'}
|
119 |
except Exception as e:
|
120 |
-
logger.error(f"
|
121 |
-
return
|
122 |
|
123 |
-
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
124 |
def scrape_with_trafilatura(url):
|
125 |
try:
|
126 |
-
downloaded = fetch_url(url)
|
127 |
-
if downloaded is None:
|
128 |
-
raise ScrapingError("Failed to download content")
|
129 |
content = extract(downloaded)
|
130 |
-
|
131 |
-
raise ScrapingError("Failed to extract content")
|
132 |
-
return {'success': True, 'content': content}
|
133 |
-
except ScrapingError as e:
|
134 |
-
logger.error(f"Scraping error for {url}: {e}")
|
135 |
-
return {'success': False, 'error': str(e)}
|
136 |
except Exception as e:
|
137 |
-
logger.error(f"
|
138 |
-
return
|
139 |
|
140 |
def rephrase_query(chat_history, query, temperature=0.2):
|
141 |
system_prompt = """You are a highly intelligent conversational chatbot. Your task is to analyze the given context and new query, then decide whether to rephrase the query with or without incorporating the context. Follow these steps:
|
@@ -296,11 +252,6 @@ def scrape_full_content(url, scraper="trafilatura", max_chars=3000):
|
|
296 |
logger.error(f"Error scraping full content from {url}: {e}")
|
297 |
return ""
|
298 |
|
299 |
-
|
300 |
-
def rate_limited_scraping(url, scraper_func, *args, **kwargs):
    """Throttle wrapper: pause briefly, then delegate to *scraper_func*.

    Sleeps a jittered 1-3 seconds so successive requests do not hit the
    same host in rapid succession, then forwards url plus any extra
    positional/keyword arguments and returns the scraper's result.
    """
    delay = random.uniform(1, 3)  # Random delay between 1-3 seconds
    time.sleep(delay)
    return scraper_func(url, *args, **kwargs)
|
303 |
-
|
304 |
def llm_summarize(query, documents, llm_client, temperature=0.2):
|
305 |
system_prompt = """You are Sentinel, a world class Financial analysis AI model who is expert at searching the web and answering user's queries. You are also an expert at summarizing web pages or documents and searching for content in them."""
|
306 |
|
@@ -378,7 +329,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
378 |
|
379 |
# Headers for SearXNG request
|
380 |
headers = {
|
381 |
-
'User-Agent':
|
382 |
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
383 |
'Accept-Language': 'en-US,en;q=0.5',
|
384 |
'Origin': 'https://shreyas094-searxng-local.hf.space',
|
@@ -415,7 +366,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
415 |
logger.warning("No results returned from SearXNG.")
|
416 |
return "No results found for the given query."
|
417 |
|
418 |
-
|
419 |
|
420 |
for result in search_results.get('results', [])[:num_results]:
|
421 |
url = result.get('url', '')
|
@@ -428,24 +379,41 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
428 |
try:
|
429 |
logger.info(f"Scraping content from: {url}")
|
430 |
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
|
|
|
|
435 |
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
445 |
scraped_content.append({
|
446 |
"title": title,
|
447 |
"url": url,
|
448 |
-
"content": content,
|
449 |
"scraper": scraper
|
450 |
})
|
451 |
except requests.exceptions.RequestException as e:
|
@@ -561,4 +529,4 @@ iface = gr.ChatInterface(
|
|
561 |
|
562 |
if __name__ == "__main__":
|
563 |
logger.info("Starting the SearXNG Scraper for Financial News using ChatInterface with Advanced Parameters")
|
564 |
-
iface.launch(share=True)
|
|
|
16 |
import os
|
17 |
from dotenv import load_dotenv
|
18 |
import certifi
|
|
|
|
|
19 |
|
20 |
# Load environment variables from a .env file
|
21 |
load_dotenv()
|
|
|
66 |
except ValueError:
|
67 |
return False
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
def scrape_with_bs4(url, session):
|
70 |
try:
|
71 |
+
response = session.get(url, timeout=10)
|
|
|
72 |
response.raise_for_status()
|
|
|
73 |
soup = BeautifulSoup(response.content, 'html.parser')
|
74 |
+
|
75 |
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
|
76 |
|
77 |
if main_content:
|
|
|
79 |
else:
|
80 |
content = soup.get_text(strip=True)
|
81 |
|
82 |
+
return content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
except Exception as e:
|
84 |
+
logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
|
85 |
+
return ""
|
86 |
|
|
|
87 |
def scrape_with_trafilatura(url):
    """Fetch *url* and return its extracted main text via trafilatura.

    Returns "" on any failure (download error, exception, or empty
    extraction); errors are logged rather than propagated so callers can
    fall through to another scraper.
    """
    try:
        page = fetch_url(url)
        text = extract(page)
        return text or ""
    except Exception as e:
        logger.error(f"Error scraping {url} with Trafilatura: {e}")
        return ""
|
95 |
|
96 |
def rephrase_query(chat_history, query, temperature=0.2):
|
97 |
system_prompt = """You are a highly intelligent conversational chatbot. Your task is to analyze the given context and new query, then decide whether to rephrase the query with or without incorporating the context. Follow these steps:
|
|
|
252 |
logger.error(f"Error scraping full content from {url}: {e}")
|
253 |
return ""
|
254 |
|
|
|
|
|
|
|
|
|
|
|
255 |
def llm_summarize(query, documents, llm_client, temperature=0.2):
|
256 |
system_prompt = """You are Sentinel, a world class Financial analysis AI model who is expert at searching the web and answering user's queries. You are also an expert at summarizing web pages or documents and searching for content in them."""
|
257 |
|
|
|
329 |
|
330 |
# Headers for SearXNG request
|
331 |
headers = {
|
332 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
333 |
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
334 |
'Accept-Language': 'en-US,en;q=0.5',
|
335 |
'Origin': 'https://shreyas094-searxng-local.hf.space',
|
|
|
366 |
logger.warning("No results returned from SearXNG.")
|
367 |
return "No results found for the given query."
|
368 |
|
369 |
+
scraped_content = []
|
370 |
|
371 |
for result in search_results.get('results', [])[:num_results]:
|
372 |
url = result.get('url', '')
|
|
|
379 |
try:
|
380 |
logger.info(f"Scraping content from: {url}")
|
381 |
|
382 |
+
# Implement a retry mechanism with different user agents
|
383 |
+
user_agents = [
|
384 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
385 |
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
|
386 |
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
387 |
+
]
|
388 |
|
389 |
+
content = ""
|
390 |
+
for ua in user_agents:
|
391 |
+
try:
|
392 |
+
if scraper == "bs4":
|
393 |
+
session.headers.update({'User-Agent': ua})
|
394 |
+
content = scrape_with_bs4(url, session)
|
395 |
+
else: # trafilatura
|
396 |
+
downloaded = fetch_url(url, headers={'User-Agent': ua})
|
397 |
+
content = extract(downloaded)
|
398 |
+
|
399 |
+
if content:
|
400 |
+
break
|
401 |
+
except requests.exceptions.HTTPError as e:
|
402 |
+
if e.response.status_code == 403:
|
403 |
+
logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
|
404 |
+
continue
|
405 |
+
else:
|
406 |
+
raise
|
407 |
+
|
408 |
+
if not content:
|
409 |
+
logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
|
410 |
+
continue
|
411 |
+
|
412 |
+
# Limit content to max_chars
|
413 |
scraped_content.append({
|
414 |
"title": title,
|
415 |
"url": url,
|
416 |
+
"content": content[:max_chars],
|
417 |
"scraper": scraper
|
418 |
})
|
419 |
except requests.exceptions.RequestException as e:
|
|
|
529 |
|
530 |
if __name__ == "__main__":
|
531 |
logger.info("Starting the SearXNG Scraper for Financial News using ChatInterface with Advanced Parameters")
|
532 |
+
iface.launch(share=True)
|