Shreyas094
committed on
Commit
•
6c48447
1
Parent(s):
c6a0be6
Update app.py
Browse files
app.py
CHANGED
@@ -27,6 +27,9 @@ from scrapy import signals
|
|
27 |
from scrapy.signalmanager import dispatcher
|
28 |
from scrapy.utils.log import configure_logging
|
29 |
from newspaper import Article
|
|
|
|
|
|
|
30 |
|
31 |
|
32 |
|
@@ -119,10 +122,29 @@ def scrape_with_scrapy(url, timeout=30):
|
|
119 |
|
120 |
def scrape_with_newspaper(url):
|
121 |
logger.info(f"Starting to scrape with Newspaper3k: {url}")
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
|
127 |
def scrape_with_bs4(url, session, max_chars=None):
|
128 |
try:
|
|
|
27 |
from scrapy.signalmanager import dispatcher
|
28 |
from scrapy.utils.log import configure_logging
|
29 |
from newspaper import Article
|
30 |
+
from io import BytesIO
|
31 |
+
from PyPDF2 import PdfReader
|
32 |
+
import logging
|
33 |
|
34 |
|
35 |
|
|
|
122 |
|
123 |
def scrape_with_newspaper(url, timeout=30):
    """Return the text found at ``url``, from either a PDF or an HTML article.

    The URL is first requested with ``requests`` so its ``Content-Type``
    header can be inspected. A response declaring ``application/pdf`` is
    parsed page by page with ``PdfReader``; anything else is handed to
    Newspaper3k's ``Article``.

    Args:
        url: Address of the document to scrape.
        timeout: Seconds ``requests`` waits on the server while probing the
            content type and downloading a PDF. Without it a stalled
            server would block the caller indefinitely.

    Returns:
        The extracted text, or an empty string if any step raised.
    """
    logger.info(f"Starting to scrape with Newspaper3k: {url}")
    try:
        # stream=True defers the body download until ``.content`` is read,
        # so an HTML page is not fetched here and then again by Newspaper3k.
        with requests.get(url, timeout=timeout, stream=True) as response:
            content_type = response.headers.get('Content-Type', '').lower()
            is_pdf = 'application/pdf' in content_type
            pdf_bytes = response.content if is_pdf else None

        if is_pdf:
            logger.info(f"Detected PDF file: {url}")
            pdf_reader = PdfReader(BytesIO(pdf_bytes))
            # ``or ""`` guards against a page yielding no text (None), so a
            # single unreadable page does not discard the whole document.
            page_texts = [
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            ]
            return "".join(page_texts).strip()

        # Regular web page: let Newspaper3k download and parse it.
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        # Best-effort scraper: report the failure and return no text.
        logger.error(f"Error scraping {url} with Newspaper3k: {e}")
        return ""
|
148 |
|
149 |
def scrape_with_bs4(url, session, max_chars=None):
|
150 |
try:
|