Shreyas094 committed
Commit: b577b65
Parent(s): f57b788
Update app.py

app.py CHANGED
@@ -27,10 +27,10 @@ from scrapy import signals
from scrapy.signalmanager import dispatcher
from scrapy.utils.log import configure_logging
from newspaper import Article
-
from PyPDF2 import PdfReader
- import
-


# Load environment variables from a .env file

@@ -82,62 +82,35 @@ def is_valid_url(url):
    except ValueError:
        return False

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    process = CrawlerProcess(settings={
-        'LOG_ENABLED': True,
-        'LOG_LEVEL': 'WARNING',
-        'DOWNLOAD_TIMEOUT': timeout
-    })
-
-    dispatcher.connect(spider_results, signal=signals.item_scraped)
-
-    process.crawl(NewsSpider, url=url)
-    process.start()
-
-    # Get the content from results
-    if results:
-        return results[0]['content']
-    return ''

def scrape_with_newspaper(url):
    logger.info(f"Starting to scrape with Newspaper3k: {url}")
    try:
-
-
-        content_type = response.headers.get('Content-Type', '').lower()
-
-        if 'application/pdf' in content_type:
-            # Handle PDF
-            logger.info(f"Detected PDF file: {url}")
-            pdf_file = BytesIO(response.content)
-            pdf_reader = PdfReader(pdf_file)
-            text = ""
-            for page in pdf_reader.pages:
-                text += page.extract_text() + "\n"
-            return text.strip()
        else:
-            # Handle regular web page
            article = Article(url)
            article.download()
            article.parse()

@@ -146,68 +119,18 @@ def scrape_with_newspaper(url):
        logger.error(f"Error scraping {url} with Newspaper3k: {e}")
        return ""

-def
    try:
-
-
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
-
-        if main_content:
-            content = main_content.get_text(strip=True, separator='\n')
        else:
-
-
-
-
-        logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
-        return ""
-
-def scrape_with_trafilatura(url, max_chars=None, timeout=5):
-    """
-    Scrape web content using Trafilatura with simplified error handling and fallback options.
-
-    Args:
-        url (str): The URL to scrape
-        max_chars (int, optional): Maximum number of characters to return
-        timeout (int, optional): Request timeout in seconds
-
-    Returns:
-        str: Extracted content or empty string if extraction fails
-    """
-    try:
-        # Make the request with timeout
-        response = requests.get(url, timeout=timeout)
-        response.raise_for_status()
-
-        # Extract content from the downloaded HTML
-        content = extract(
-            response.text,
-            include_comments=False,
-            include_tables=True,
-            no_fallback=False
-        )
-
-        # If first attempt fails, try direct URL extraction
-        if not content:
-            content = extract(
-                url,
-                include_comments=False,
-                include_tables=True,
-                no_fallback=False
-            )
-
-        # Return content with optional length limit
-        if content and max_chars:
-            return content[:max_chars]
-        return content or ""
-
-    except requests.Timeout:
-        logger.error(f"Timeout error while scraping {url}")
-        return ""
    except Exception as e:
-        logger.error(f"Error scraping {url}: {
        return ""

def rephrase_query(chat_history, query, temperature=0.2):

@@ -343,65 +266,19 @@ Remember to focus on financial aspects and implications in your assessment and s
        logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
        return "Error: Unable to assess relevance and summarize"

-def scrape_full_content(url,
-    """
-    Unified content scraper that supports multiple scraping methods.
-
-    Args:
-        url (str): The URL to scrape
-        scraper (str): Scraping method to use ('bs4', 'trafilatura', 'scrapy', 'newspaper')
-        max_chars (int): Maximum number of characters to return
-        timeout (int): Request timeout in seconds
-
-    Returns:
-        str: Scraped content or empty string if scraping fails
-    """
    try:
        logger.info(f"Scraping full content from: {url}")

-

-
-
-
-            response.raise_for_status()
-            soup = BeautifulSoup(response.content, 'html.parser')
-
-            # Try to find the main content
-            main_content = (
-                soup.find('main') or
-                soup.find('article') or
-                soup.find('div', class_='content')
-            )
-
-            content = main_content.get_text(strip=True, separator='\n') if main_content else soup.get_text(strip=True, separator='\n')
-
-        elif scraper == "trafilatura":
-            content = scrape_with_trafilatura(url, max_chars, timeout)
-
-        elif scraper == "scrapy":
-            content = scrape_with_scrapy(url, timeout)
-
-        elif scraper == "newspaper":
-            article = Article(url)
-            article.download()
-            article.parse()
-            content = article.text
-
-        else:
-            logger.error(f"Unknown scraper: {scraper}")
-            return ""
-
-        # Standardize whitespace and limit content length
-        if content:
-            content = " ".join(content.split())  # Standardize whitespace
-            return content[:max_chars] if max_chars else content
-
-        return ""

-
-
-        return ""
    except Exception as e:
        logger.error(f"Error scraping full content from {url}: {e}")
        return ""

@@ -445,7 +322,7 @@ Your response should be detailed, informative, accurate, and directly relevant t
        logger.error(f"Error in LLM summarization: {e}")
        return "Error: Unable to generate a summary. Please try again."

-def search_and_scrape(query, chat_history, num_results=5,
                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5):
    try:
        # Step 1: Rephrase the Query

@@ -532,8 +409,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
        try:
            logger.info(f"Scraping content from: {url}")

-
-            content = scrape_full_content(url, scraper, max_chars, timeout)

            if not content:
                logger.warning(f"Failed to scrape content from {url}")

@@ -542,8 +418,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
            scraped_content.append({
                "title": title,
                "url": url,
-                "content": content
-                "scraper": scraper
            })
            logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
        except requests.exceptions.RequestException as e:

from scrapy.signalmanager import dispatcher
from scrapy.utils.log import configure_logging
from newspaper import Article
+import html2text
from PyPDF2 import PdfReader
+from io import BytesIO
+


# Load environment variables from a .env file

    except ValueError:
        return False

+def is_pdf(url):
+    try:
+        response = requests.head(url, allow_redirects=True)
+        content_type = response.headers.get('Content-Type', '').lower()
+        return 'application/pdf' in content_type
+    except Exception as e:
+        logger.error(f"Error checking content type for {url}: {e}")
+        return False
+
+def scrape_pdf(url):
+    logger.info(f"Scraping PDF: {url}")
+    try:
+        response = requests.get(url)
+        pdf_file = BytesIO(response.content)
+        pdf_reader = PdfReader(pdf_file)
+        text = ""
+        for page in pdf_reader.pages:
+            text += page.extract_text() + "\n"
+        return text.strip()
+    except Exception as e:
+        logger.error(f"Error scraping PDF {url}: {e}")
+        return ""
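A minimal usage sketch (not part of the commit) of how the two new helpers combine; the URL and the fallback call are assumptions chosen for illustration:

    # Hypothetical example: send PDFs to the PyPDF2 path, everything else to Newspaper3k.
    url = "https://example.com/annual-report.pdf"  # placeholder URL, not taken from app.py
    text = scrape_pdf(url) if is_pdf(url) else scrape_with_newspaper(url)
    print(text[:200])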

def scrape_with_newspaper(url):
    logger.info(f"Starting to scrape with Newspaper3k: {url}")
    try:
+        if is_pdf(url):
+            return scrape_pdf(url)
        else:
            article = Article(url)
            article.download()
            article.parse()

        logger.error(f"Error scraping {url} with Newspaper3k: {e}")
        return ""

+def scrape_with_html2text(url):
+    logger.info(f"Starting to scrape with html2text: {url}")
    try:
+        if is_pdf(url):
+            return scrape_pdf(url)
        else:
+            response = requests.get(url)
+            h = html2text.HTML2Text()
+            h.ignore_links = True
+            return h.handle(response.text)
    except Exception as e:
+        logger.error(f"Error scraping {url} with html2text: {e}")
        return ""

def rephrase_query(chat_history, query, temperature=0.2):

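For reference, a standalone sketch (illustrative only, not from app.py) of what html2text produces with the same setting used above; HTML2Text.handle() returns Markdown-style plain text, and ignore_links=True drops hyperlink markup:

    import html2text

    h = html2text.HTML2Text()
    h.ignore_links = True  # same option set in scrape_with_html2text
    sample = "<h1>Title</h1><p>Some <a href='https://example.com'>linked</a> text.</p>"
    print(h.handle(sample))  # roughly: "# Title" followed by "Some linked text."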
        logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
        return "Error: Unable to assess relevance and summarize"

+def scrape_full_content(url, max_chars=3000):
    try:
        logger.info(f"Scraping full content from: {url}")

+        # Try newspaper first
+        content = scrape_with_newspaper(url)

+        # If newspaper fails, try html2text
+        if not content:
+            content = scrape_with_html2text(url)

+        # Limit the content to max_chars
+        return content[:max_chars] if content else ""
    except Exception as e:
        logger.error(f"Error scraping full content from {url}: {e}")
        return ""
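A hypothetical call site (not part of the commit) showing the new fallback chain end to end; the URL and printed messages are placeholders:

    # Tries Newspaper3k first, falls back to html2text, then truncates to max_chars.
    snippet = scrape_full_content("https://example.com/article", max_chars=1000)
    if snippet:
        print(f"Scraped {len(snippet)} characters")
    else:
        print("Both scrapers failed for this URL")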

        logger.error(f"Error in LLM summarization: {e}")
        return "Error: Unable to generate a summary. Please try again."

+def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5):
    try:
        # Step 1: Rephrase the Query

        try:
            logger.info(f"Scraping content from: {url}")

+            content = scrape_full_content(url, max_chars, timeout)

            if not content:
                logger.warning(f"Failed to scrape content from {url}")

            scraped_content.append({
                "title": title,
                "url": url,
+                "content": content
            })
            logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
        except requests.exceptions.RequestException as e: