Shreyas094
commited on
Commit
•
f57b788
1
Parent(s):
5067590
Update app.py
Browse files
app.py
CHANGED
@@ -124,52 +124,26 @@ def scrape_with_newspaper(url):
|
|
124 |
logger.info(f"Starting to scrape with Newspaper3k: {url}")
|
125 |
try:
|
126 |
# Check if the URL is a PDF
|
127 |
-
response = requests.get(url)
|
128 |
content_type = response.headers.get('Content-Type', '').lower()
|
129 |
|
130 |
if 'application/pdf' in content_type:
|
|
|
131 |
logger.info(f"Detected PDF file: {url}")
|
132 |
-
|
|
|
|
|
|
|
|
|
|
|
133 |
else:
|
134 |
# Handle regular web page
|
135 |
article = Article(url)
|
136 |
article.download()
|
137 |
article.parse()
|
138 |
return article.text
|
139 |
-
except requests.RequestException as e:
|
140 |
-
logger.error(f"Error fetching content from {url}: {e}")
|
141 |
-
except Exception as e:
|
142 |
-
logger.error(f"Unexpected error scraping {url}: {e}")
|
143 |
-
|
144 |
-
# If we've reached this point, both methods have failed
|
145 |
-
logger.warning(f"All scraping methods failed for {url}")
|
146 |
-
return ""
|
147 |
-
|
148 |
-
def extract_pdf_content(pdf_content):
|
149 |
-
try:
|
150 |
-
# First, try using PyPDF2 directly
|
151 |
-
pdf_file = BytesIO(pdf_content)
|
152 |
-
pdf_reader = PdfReader(pdf_file)
|
153 |
-
text = ""
|
154 |
-
for page in pdf_reader.pages:
|
155 |
-
text += page.extract_text() + "\n"
|
156 |
-
if text.strip():
|
157 |
-
return text.strip()
|
158 |
-
|
159 |
-
# If PyPDF2 fails to extract text, try saving the PDF and using newspaper
|
160 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
|
161 |
-
temp_pdf.write(pdf_content)
|
162 |
-
temp_pdf_path = temp_pdf.name
|
163 |
-
|
164 |
-
try:
|
165 |
-
article = Article('file://' + temp_pdf_path)
|
166 |
-
article.download()
|
167 |
-
article.parse()
|
168 |
-
return article.text
|
169 |
-
finally:
|
170 |
-
os.unlink(temp_pdf_path) # Ensure we always delete the temporary file
|
171 |
except Exception as e:
|
172 |
-
logger.error(f"Error extracting PDF content: {e}")
|
173 |
return ""
|
174 |
|
175 |
def scrape_with_bs4(url, session, max_chars=None):
|
|
|
def scrape_with_newspaper(url):
    """Scrape the textual content of *url*.

    PDF responses (detected via the ``Content-Type`` header of an initial
    GET) are parsed with PyPDF2; anything else is handed to newspaper3k's
    ``Article`` downloader/parser.

    Args:
        url: The URL to fetch.

    Returns:
        The extracted text, or ``""`` when fetching or parsing fails
        (failures are logged, never raised to the caller).
    """
    logger.info(f"Starting to scrape with Newspaper3k: {url}")
    try:
        # Fetch once up front so the Content-Type header can be inspected.
        # The timeout prevents an unresponsive server from hanging the
        # scraper indefinitely (requests has no default timeout).
        response = requests.get(url, timeout=30)
        content_type = response.headers.get('Content-Type', '').lower()

        if 'application/pdf' in content_type:
            # Handle PDF
            logger.info(f"Detected PDF file: {url}")
            pdf_file = BytesIO(response.content)
            pdf_reader = PdfReader(pdf_file)
            # join() avoids quadratic `text += ...` concatenation on
            # documents with many pages.
            text = "".join(page.extract_text() + "\n" for page in pdf_reader.pages)
            return text.strip()
        else:
            # Handle regular web page
            article = Article(url)
            article.download()
            article.parse()
            return article.text
    except requests.RequestException as e:
        # Network-level failures (DNS, refused connection, timeout) get a
        # message that does not blame the parser.
        logger.error(f"Error fetching content from {url}: {e}")
    except Exception as e:
        # Parsing failures (malformed PDF, newspaper3k errors, etc.).
        logger.error(f"Error scraping {url} with Newspaper3k: {e}")
    return ""
148 |
|
149 |
def scrape_with_bs4(url, session, max_chars=None):
|