Shreyas094 committed
Commit a5594d9
1 Parent(s): 7954811
Update app.py
app.py CHANGED
@@ -27,11 +27,9 @@ from scrapy import signals
 from scrapy.signalmanager import dispatcher
 from scrapy.utils.log import configure_logging
 from newspaper import Article
-import
-
-
-
-
+import PyPDF2
+import io
+import requests

 # Load environment variables from a .env file
 load_dotenv()
@@ -82,55 +80,148 @@ def is_valid_url(url):
     except ValueError:
         return False

-def
-    try:
-        response = requests.head(url, allow_redirects=True)
-        content_type = response.headers.get('Content-Type', '').lower()
-        return 'application/pdf' in content_type
-    except Exception as e:
-        logger.error(f"Error checking content type for {url}: {e}")
-        return False
-
-def scrape_pdf(url):
-    logger.info(f"Scraping PDF: {url}")
+def scrape_pdf_content(url, max_chars=3000, timeout=5):
     try:
-
-
-
-
+        logger.info(f"Scraping PDF content from: {url}")
+
+        # Download the PDF file
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+
+        # Create a PDF reader object
+        pdf_reader = PyPDF2.PdfReader(io.BytesIO(response.content))
+
+        # Extract text from all pages
+        content = ""
         for page in pdf_reader.pages:
-
-
+            content += page.extract_text() + "\n"
+
+        # Limit the content to max_chars
+        return content[:max_chars] if content else ""
+    except requests.Timeout:
+        logger.error(f"Timeout error while scraping PDF content from {url}")
+        return ""
     except Exception as e:
-        logger.error(f"Error scraping PDF {url}: {e}")
+        logger.error(f"Error scraping PDF content from {url}: {e}")
         return ""

+
+class NewsSpider(scrapy.Spider):
+    name = 'news_spider'
+
+    def __init__(self, url=None, *args, **kwargs):
+        super(NewsSpider, self).__init__(*args, **kwargs)
+        self.start_urls = [url] if url else []
+
+    def parse(self, response):
+        content = ' '.join(response.css('p::text').getall())
+        self.logger.info(f"Scraped content length: {len(content)}")
+        return {'content': content}
+
+def scrape_with_scrapy(url, timeout=30):
+    logger.info(f"Starting to scrape with Scrapy: {url}")
+    configure_logging(install_root_handler=False)
+    logging.getLogger('scrapy').setLevel(logging.WARNING)
+
+    results = []
+
+    def spider_results(signal, sender, item, response, spider):
+        results.append(item)
+
+    process = CrawlerProcess(settings={
+        'LOG_ENABLED': True,
+        'LOG_LEVEL': 'WARNING',
+        'DOWNLOAD_TIMEOUT': timeout
+    })
+
+
+    dispatcher.connect(spider_results, signal=signals.item_scraped)
+
+    process.crawl(NewsSpider, url=url)
+    process.start()
+
+    # Get the content from results
+    if results:
+        return results[0]['content']
+    return ''
 def scrape_with_newspaper(url):
+    if url.lower().endswith('.pdf'):
+        return scrape_pdf_content(url)
+
     logger.info(f"Starting to scrape with Newspaper3k: {url}")
     try:
-
-
-
-
-
-
-
+        article = Article(url)
+        article.download()
+        article.parse()
+
+        # Combine title and text
+        content = f"Title: {article.title}\n\n"
+        content += article.text
+
+        # Add publish date if available
+        if article.publish_date:
+            content += f"\n\nPublish Date: {article.publish_date}"
+
+        # Add authors if available
+        if article.authors:
+            content += f"\n\nAuthors: {', '.join(article.authors)}"
+
+        # Add top image URL if available
+        if article.top_image:
+            content += f"\n\nTop Image URL: {article.top_image}"
+
+        return content
     except Exception as e:
         logger.error(f"Error scraping {url} with Newspaper3k: {e}")
         return ""

-def
-    logger.info(f"Starting to scrape with html2text: {url}")
+def scrape_with_bs4(url, session, max_chars=None):
     try:
-
-
+        response = session.get(url, timeout=5)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
+
+        if main_content:
+            content = main_content.get_text(strip=True, separator='\n')
         else:
-
-
-
-
+            content = soup.get_text(strip=True, separator='\n')
+
+        return content[:max_chars] if max_chars else content
+    except Exception as e:
+        logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
+        return ""
+
+def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
+    try:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+        downloaded = response.text
+        content = ""
+
+        if use_beautifulsoup:
+            soup = BeautifulSoup(downloaded, "lxml")
+            # Convert BeautifulSoup object to a string
+            html_string = str(soup)
+            # Use Trafilatura's extract function directly on the HTML string
+            content = extract(html_string, include_comments=False, include_tables=True, no_fallback=False)
+
+        # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
+        if not content and use_beautifulsoup:
+            logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
+            content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
+
+        # If still no content, use the URL directly
+        if not content:
+            content = extract(url, include_comments=False, include_tables=True, no_fallback=False)
+
+        return (content or "")[:max_chars] if max_chars else (content or "")
+    except requests.Timeout:
+        logger.error(f"Timeout error while scraping {url} with Trafilatura")
+        return ""
     except Exception as e:
-        logger.error(f"Error scraping {url} with
+        logger.error(f"Error scraping {url} with Trafilatura: {e}")
         return ""

 def rephrase_query(chat_history, query, temperature=0.2):
@@ -266,19 +357,42 @@ Remember to focus on financial aspects and implications in your assessment and s
         logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
         return "Error: Unable to assess relevance and summarize"

-def scrape_full_content(url, max_chars=3000):
+def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
     try:
         logger.info(f"Scraping full content from: {url}")

-        #
-
+        # Check if the URL ends with .pdf
+        if url.lower().endswith('.pdf'):
+            return scrape_pdf_content(url, max_chars, timeout)

-
-
-
+        if scraper == "bs4":
+            session = requests_retry_session()
+            response = session.get(url, timeout=timeout)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Try to find the main content
+            main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
+
+            if main_content:
+                content = main_content.get_text(strip=True, separator='\n')
+            else:
+                content = soup.get_text(strip=True, separator='\n')
+        elif scraper == "trafilatura":
+            content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
+        elif scraper == "scrapy":
+            content = scrape_with_scrapy(url, timeout)
+        elif scraper == "newspaper":
+            content = scrape_with_newspaper(url)
+        else:
+            logger.error(f"Unknown scraper: {scraper}")
+            return ""

         # Limit the content to max_chars
         return content[:max_chars] if content else ""
+    except requests.Timeout:
+        logger.error(f"Timeout error while scraping full content from {url}")
+        return ""
     except Exception as e:
         logger.error(f"Error scraping full content from {url}: {e}")
         return ""
@@ -322,7 +436,7 @@ Your response should be detailed, informative, accurate, and directly relevant t
         logger.error(f"Error in LLM summarization: {e}")
         return "Error: Unable to generate a summary. Please try again."

-def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
+def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_chars=3000, time_range="", language="all", category="",
                       engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5):
     try:
         # Step 1: Rephrase the Query
@@ -407,9 +521,9 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
                 continue

             try:
-                logger.info(f"
+                logger.info(f"Processing content from: {url}")

-                content = scrape_full_content(url, max_chars, timeout)
+                content = scrape_full_content(url, scraper, max_chars, timeout)

                 if not content:
                     logger.warning(f"Failed to scrape content from {url}")
@@ -418,7 +532,8 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
                 scraped_content.append({
                     "title": title,
                     "url": url,
-                    "content": content
+                    "content": content,
+                    "scraper": "pdf" if url.lower().endswith('.pdf') else scraper
                 })
                 logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
             except requests.exceptions.RequestException as e:
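For context on how the scraper argument added in this commit is meant to be exercised, here is a minimal usage sketch. It is not part of the commit: it assumes the patched app.py above is importable from the current directory with its dependencies installed, and the URLs are placeholders.

# Minimal usage sketch (illustrative, not part of the commit): assumes app.py,
# as patched above, is importable and its dependencies (requests, BeautifulSoup,
# trafilatura, PyPDF2, Scrapy, newspaper3k) are installed.
from app import scrape_full_content

if __name__ == "__main__":
    # Placeholder URLs for illustration only.
    html_url = "https://example.com/some-article"
    pdf_url = "https://example.com/some-report.pdf"

    # Select one of the scraper backends by name
    # ("bs4", "trafilatura", "scrapy", or "newspaper").
    text = scrape_full_content(html_url, scraper="trafilatura", max_chars=3000, timeout=5)
    print(f"HTML content length: {len(text)}")

    # URLs ending in .pdf are routed to scrape_pdf_content regardless of the
    # scraper argument, mirroring the check at the top of scrape_full_content.
    pdf_text = scrape_full_content(pdf_url, scraper="bs4", max_chars=3000, timeout=5)
    print(f"PDF content length: {len(pdf_text)}")

As the diff shows, search_and_scrape forwards the same scraper value when it calls scrape_full_content, so the backend chosen at the top level applies to every result it scrapes.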