Shreyas094
commited on
Commit
•
f57b788
1
Parent(s):
5067590
Update app.py
Browse files
app.py
CHANGED
@@ -124,52 +124,26 @@ def scrape_with_newspaper(url):
|
|
124 |
logger.info(f"Starting to scrape with Newspaper3k: {url}")
|
125 |
try:
|
126 |
# Check if the URL is a PDF
|
127 |
-
response = requests.get(url)
|
128 |
content_type = response.headers.get('Content-Type', '').lower()
|
129 |
|
130 |
if 'application/pdf' in content_type:
|
|
|
131 |
logger.info(f"Detected PDF file: {url}")
|
132 |
-
|
|
|
|
|
|
|
|
|
|
|
133 |
else:
|
134 |
# Handle regular web page
|
135 |
article = Article(url)
|
136 |
article.download()
|
137 |
article.parse()
|
138 |
return article.text
|
139 |
-
except requests.RequestException as e:
|
140 |
-
logger.error(f"Error fetching content from {url}: {e}")
|
141 |
-
except Exception as e:
|
142 |
-
logger.error(f"Unexpected error scraping {url}: {e}")
|
143 |
-
|
144 |
-
# If we've reached this point, both methods have failed
|
145 |
-
logger.warning(f"All scraping methods failed for {url}")
|
146 |
-
return ""
|
147 |
-
|
148 |
-
def extract_pdf_content(pdf_content):
|
149 |
-
try:
|
150 |
-
# First, try using PyPDF2 directly
|
151 |
-
pdf_file = BytesIO(pdf_content)
|
152 |
-
pdf_reader = PdfReader(pdf_file)
|
153 |
-
text = ""
|
154 |
-
for page in pdf_reader.pages:
|
155 |
-
text += page.extract_text() + "\n"
|
156 |
-
if text.strip():
|
157 |
-
return text.strip()
|
158 |
-
|
159 |
-
# If PyPDF2 fails to extract text, try saving the PDF and using newspaper
|
160 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
|
161 |
-
temp_pdf.write(pdf_content)
|
162 |
-
temp_pdf_path = temp_pdf.name
|
163 |
-
|
164 |
-
try:
|
165 |
-
article = Article('file://' + temp_pdf_path)
|
166 |
-
article.download()
|
167 |
-
article.parse()
|
168 |
-
return article.text
|
169 |
-
finally:
|
170 |
-
os.unlink(temp_pdf_path) # Ensure we always delete the temporary file
|
171 |
except Exception as e:
|
172 |
-
logger.error(f"Error extracting PDF content: {e}")
|
173 |
return ""
|
174 |
|
175 |
def scrape_with_bs4(url, session, max_chars=None):
|
|
|
def scrape_with_newspaper(url):
    """Scrape the textual content of *url*.

    PDF responses (detected via the ``Content-Type`` header of an initial
    GET) are parsed with PyPDF2; anything else is handed to newspaper3k's
    ``Article`` downloader/parser.

    Args:
        url: The URL to fetch.

    Returns:
        The extracted text, or ``""`` when fetching or parsing fails
        (failures are logged, never raised to the caller).
    """
    logger.info(f"Starting to scrape with Newspaper3k: {url}")
    try:
        # Fetch once up front so the Content-Type header can be inspected.
        # The timeout prevents an unresponsive server from hanging the
        # scraper indefinitely (requests has no default timeout).
        response = requests.get(url, timeout=30)
        content_type = response.headers.get('Content-Type', '').lower()

        if 'application/pdf' in content_type:
            # Handle PDF
            logger.info(f"Detected PDF file: {url}")
            pdf_file = BytesIO(response.content)
            pdf_reader = PdfReader(pdf_file)
            # join() avoids quadratic `text += ...` concatenation on
            # documents with many pages.
            text = "".join(page.extract_text() + "\n" for page in pdf_reader.pages)
            return text.strip()
        else:
            # Handle regular web page
            article = Article(url)
            article.download()
            article.parse()
            return article.text
    except requests.RequestException as e:
        # Network-level failures (DNS, refused connection, timeout) get a
        # message that does not blame the parser.
        logger.error(f"Error fetching content from {url}: {e}")
    except Exception as e:
        # Parsing failures (malformed PDF, newspaper3k errors, etc.).
        logger.error(f"Error scraping {url} with Newspaper3k: {e}")
    return ""
148 |
|
149 |
def scrape_with_bs4(url, session, max_chars=None):
|