Shreyas094 committed on
Commit
f57b788
1 Parent(s): 5067590

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -35
app.py CHANGED
def scrape_with_newspaper(url):
    """Fetch *url* and extract its main text content.

    PDFs (detected via the Content-Type header) are handed off to
    extract_pdf_content(); regular pages are parsed with Newspaper3k.
    Returns the extracted text, or "" when every method fails.
    """
    logger.info(f"Starting to scrape with Newspaper3k: {url}")
    try:
        # Check if the URL is a PDF
        response = requests.get(url, timeout=30)
        content_type = response.headers.get('Content-Type', '').lower()

        if 'application/pdf' in content_type:
            logger.info(f"Detected PDF file: {url}")
            return extract_pdf_content(response.content)

        # Handle regular web page
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except requests.RequestException as e:
        logger.error(f"Error fetching content from {url}: {e}")
    except Exception as e:
        logger.error(f"Unexpected error scraping {url}: {e}")

    # If we've reached this point, both methods have failed
    logger.warning(f"All scraping methods failed for {url}")
    return ""
148
def extract_pdf_content(pdf_content):
    """Extract text from raw PDF bytes.

    Tries PyPDF2 first; if that yields no text, writes the bytes to a
    temporary file and lets Newspaper3k parse it. Returns "" on failure.
    """
    try:
        # First, try using PyPDF2 directly
        pdf_reader = PdfReader(BytesIO(pdf_content))
        # join avoids quadratic += concatenation over many pages, and
        # `or ""` guards pages where extract_text() yields nothing
        text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
        if text.strip():
            return text.strip()

        # If PyPDF2 fails to extract text, try saving the PDF and using newspaper
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
            temp_pdf.write(pdf_content)
            temp_pdf_path = temp_pdf.name

        try:
            # NOTE(review): newspaper downloads via requests, which may not
            # support file:// URLs — confirm this fallback works in practice.
            article = Article('file://' + temp_pdf_path)
            article.download()
            article.parse()
            return article.text
        finally:
            os.unlink(temp_pdf_path)  # Ensure we always delete the temporary file
    except Exception as e:
        logger.error(f"Error extracting content from PDF: {e}")
    return ""
174
 
175
  def scrape_with_bs4(url, session, max_chars=None):
 
def scrape_with_newspaper(url):
    """Scrape *url* with Newspaper3k, extracting PDFs inline via PyPDF2.

    Returns the extracted text, or "" on any failure (errors are logged,
    never propagated).
    """
    logger.info(f"Starting to scrape with Newspaper3k: {url}")
    try:
        # Check if the URL is a PDF
        # NOTE(review): timeout restored — this commit dropped the previous
        # timeout=30, and requests.get() without a timeout can block the
        # worker indefinitely on a stalled server.
        response = requests.get(url, timeout=30)
        content_type = response.headers.get('Content-Type', '').lower()

        if 'application/pdf' in content_type:
            # Handle PDF
            logger.info(f"Detected PDF file: {url}")
            pdf_file = BytesIO(response.content)
            pdf_reader = PdfReader(pdf_file)
            # join avoids quadratic += concatenation; `or ""` guards pages
            # where extract_text() yields nothing
            text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
            return text.strip()
        else:
            # Handle regular web page
            article = Article(url)
            article.download()
            article.parse()
            return article.text
    except Exception as e:
        logger.error(f"Error scraping {url} with Newspaper3k: {e}")
        return ""
148
 
149
  def scrape_with_bs4(url, session, max_chars=None):