Shreyas094 committed on
Commit
6c48447
1 Parent(s): c6a0be6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -4
app.py CHANGED
@@ -27,6 +27,9 @@ from scrapy import signals
27
  from scrapy.signalmanager import dispatcher
28
  from scrapy.utils.log import configure_logging
29
  from newspaper import Article
 
 
 
30
 
31
 
32
 
@@ -119,10 +122,29 @@ def scrape_with_scrapy(url, timeout=30):
119
 
120
  def scrape_with_newspaper(url):
121
  logger.info(f"Starting to scrape with Newspaper3k: {url}")
122
- article = Article(url)
123
- article.download()
124
- article.parse()
125
- return article.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
  def scrape_with_bs4(url, session, max_chars=None):
128
  try:
 
27
  from scrapy.signalmanager import dispatcher
28
  from scrapy.utils.log import configure_logging
29
  from newspaper import Article
30
+ from io import BytesIO
31
+ from PyPDF2 import PdfReader
32
+ import logging
33
 
34
 
35
 
 
122
 
123
  def scrape_with_newspaper(url):
124
  logger.info(f"Starting to scrape with Newspaper3k: {url}")
125
+ try:
126
+ # Check if the URL is a PDF
127
+ response = requests.get(url)
128
+ content_type = response.headers.get('Content-Type', '').lower()
129
+
130
+ if 'application/pdf' in content_type:
131
+ # Handle PDF
132
+ logger.info(f"Detected PDF file: {url}")
133
+ pdf_file = BytesIO(response.content)
134
+ pdf_reader = PdfReader(pdf_file)
135
+ text = ""
136
+ for page in pdf_reader.pages:
137
+ text += page.extract_text() + "\n"
138
+ return text.strip()
139
+ else:
140
+ # Handle regular web page
141
+ article = Article(url)
142
+ article.download()
143
+ article.parse()
144
+ return article.text
145
+ except Exception as e:
146
+ logger.error(f"Error scraping {url} with Newspaper3k: {e}")
147
+ return ""
148
 
149
  def scrape_with_bs4(url, session, max_chars=None):
150
  try: