SearXNG-WebSearch-Agent

Running

Shreyas094 committed on Oct 5, 2024

Commit

6c48447

verified ·

1 Parent(s): c6a0be6

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -27,6 +27,9 @@ from scrapy import signals
 from scrapy.signalmanager import dispatcher
 from scrapy.utils.log import configure_logging
 from newspaper import Article
@@ -119,10 +122,29 @@ def scrape_with_scrapy(url, timeout=30):
 def scrape_with_newspaper(url):
     logger.info(f"Starting to scrape with Newspaper3k: {url}")
-    article = Article(url)
-    article.download()
-    article.parse()
-    return article.text
 def scrape_with_bs4(url, session, max_chars=None):
     try:

 from scrapy.signalmanager import dispatcher
 from scrapy.utils.log import configure_logging
 from newspaper import Article
+from io import BytesIO
+from PyPDF2 import PdfReader
+import logging
 def scrape_with_newspaper(url, timeout=30):
     """Return the readable text found at ``url`` (PDF or regular web page).

     The response's Content-Type header selects the extraction path:
     ``application/pdf`` bodies are read page by page with ``PdfReader``;
     anything else is downloaded and parsed by Newspaper3k's ``Article``.

     Args:
         url: Address of the page or PDF document to scrape.
         timeout: Seconds to wait on the probing HTTP request. Without a
             timeout, a server that never answers would block the caller
             indefinitely. It does not apply to Newspaper3k's own download.

     Returns:
         The extracted text (whitespace-stripped for PDFs), or an empty
         string if any step fails; the failure is logged, not raised.
     """
     logger.info(f"Starting to scrape with Newspaper3k: {url}")
     try:
         # stream=True retrieves only the headers up front, so an HTML page is
         # not downloaded here and then a second time by Article.download().
         # The context manager releases the connection on every path.
         with requests.get(url, stream=True, timeout=timeout) as response:
             content_type = response.headers.get('Content-Type', '').lower()
             is_pdf = 'application/pdf' in content_type
             # The body is only read when it is actually needed (PDF case).
             pdf_bytes = response.content if is_pdf else None

         if is_pdf:
             logger.info(f"Detected PDF file: {url}")
             pdf_reader = PdfReader(BytesIO(pdf_bytes))
             # Guard against pages that yield no text (e.g. scanned images) so
             # that one empty page cannot discard the rest of the document.
             page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
             return "\n".join(page_texts).strip()

         # Regular web page: let Newspaper3k fetch and parse it.
         article = Article(url)
         article.download()
         article.parse()
         return article.text
     except Exception as e:
         # Deliberate best-effort boundary: callers get "" rather than an
         # exception when a source cannot be scraped.
         logger.error(f"Error scraping {url} with Newspaper3k: {e}")
         return ""
 def scrape_with_bs4(url, session, max_chars=None):
     try: