Shreyas094 committed
Commit: b577b65
Parent(s): f57b788
Update app.py

app.py CHANGED
@@ -27,10 +27,10 @@ from scrapy import signals
from scrapy.signalmanager import dispatcher
from scrapy.utils.log import configure_logging
from newspaper import Article
-
from PyPDF2 import PdfReader
- import
-


# Load environment variables from a .env file

@@ -82,62 +82,35 @@ def is_valid_url(url):
    except ValueError:
        return False

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    process = CrawlerProcess(settings={
-        'LOG_ENABLED': True,
-        'LOG_LEVEL': 'WARNING',
-        'DOWNLOAD_TIMEOUT': timeout
-    })
-
-    dispatcher.connect(spider_results, signal=signals.item_scraped)
-
-    process.crawl(NewsSpider, url=url)
-    process.start()
-
-    # Get the content from results
-    if results:
-        return results[0]['content']
-    return ''

def scrape_with_newspaper(url):
    logger.info(f"Starting to scrape with Newspaper3k: {url}")
    try:
-
-
-        content_type = response.headers.get('Content-Type', '').lower()
-
-        if 'application/pdf' in content_type:
-            # Handle PDF
-            logger.info(f"Detected PDF file: {url}")
-            pdf_file = BytesIO(response.content)
-            pdf_reader = PdfReader(pdf_file)
-            text = ""
-            for page in pdf_reader.pages:
-                text += page.extract_text() + "\n"
-            return text.strip()
        else:
-            # Handle regular web page
            article = Article(url)
            article.download()
            article.parse()

@@ -146,68 +119,18 @@ def scrape_with_newspaper(url):
        logger.error(f"Error scraping {url} with Newspaper3k: {e}")
        return ""

-def
    try:
-
-
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
-
-        if main_content:
-            content = main_content.get_text(strip=True, separator='\n')
        else:
-
-
-
-
-        logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
-        return ""
-
-def scrape_with_trafilatura(url, max_chars=None, timeout=5):
-    """
-    Scrape web content using Trafilatura with simplified error handling and fallback options.
-
-    Args:
-        url (str): The URL to scrape
-        max_chars (int, optional): Maximum number of characters to return
-        timeout (int, optional): Request timeout in seconds
-
-    Returns:
-        str: Extracted content or empty string if extraction fails
-    """
-    try:
-        # Make the request with timeout
-        response = requests.get(url, timeout=timeout)
-        response.raise_for_status()
-
-        # Extract content from the downloaded HTML
-        content = extract(
-            response.text,
-            include_comments=False,
-            include_tables=True,
-            no_fallback=False
-        )
-
-        # If first attempt fails, try direct URL extraction
-        if not content:
-            content = extract(
-                url,
-                include_comments=False,
-                include_tables=True,
-                no_fallback=False
-            )
-
-        # Return content with optional length limit
-        if content and max_chars:
-            return content[:max_chars]
-        return content or ""
-
-    except requests.Timeout:
-        logger.error(f"Timeout error while scraping {url}")
-        return ""
    except Exception as e:
-        logger.error(f"Error scraping {url}: {
        return ""

def rephrase_query(chat_history, query, temperature=0.2):

@@ -343,65 +266,19 @@ Remember to focus on financial aspects and implications in your assessment and s
        logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
        return "Error: Unable to assess relevance and summarize"

-def scrape_full_content(url,
-    """
-    Unified content scraper that supports multiple scraping methods.
-
-    Args:
-        url (str): The URL to scrape
-        scraper (str): Scraping method to use ('bs4', 'trafilatura', 'scrapy', 'newspaper')
-        max_chars (int): Maximum number of characters to return
-        timeout (int): Request timeout in seconds
-
-    Returns:
-        str: Scraped content or empty string if scraping fails
-    """
    try:
        logger.info(f"Scraping full content from: {url}")

-

-
-
-
-            response.raise_for_status()
-            soup = BeautifulSoup(response.content, 'html.parser')
-
-            # Try to find the main content
-            main_content = (
-                soup.find('main') or
-                soup.find('article') or
-                soup.find('div', class_='content')
-            )
-
-            content = main_content.get_text(strip=True, separator='\n') if main_content else soup.get_text(strip=True, separator='\n')
-
-        elif scraper == "trafilatura":
-            content = scrape_with_trafilatura(url, max_chars, timeout)
-
-        elif scraper == "scrapy":
-            content = scrape_with_scrapy(url, timeout)
-
-        elif scraper == "newspaper":
-            article = Article(url)
-            article.download()
-            article.parse()
-            content = article.text
-
-        else:
-            logger.error(f"Unknown scraper: {scraper}")
-            return ""
-
-        # Standardize whitespace and limit content length
-        if content:
-            content = " ".join(content.split())  # Standardize whitespace
-            return content[:max_chars] if max_chars else content
-
-        return ""

-
-
-        return ""
    except Exception as e:
        logger.error(f"Error scraping full content from {url}: {e}")
        return ""

@@ -445,7 +322,7 @@ Your response should be detailed, informative, accurate, and directly relevant t
        logger.error(f"Error in LLM summarization: {e}")
        return "Error: Unable to generate a summary. Please try again."

-def search_and_scrape(query, chat_history, num_results=5,
                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5):
    try:
        # Step 1: Rephrase the Query

@@ -532,8 +409,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
        try:
            logger.info(f"Scraping content from: {url}")

-
-            content = scrape_full_content(url, scraper, max_chars, timeout)

            if not content:
                logger.warning(f"Failed to scrape content from {url}")

@@ -542,8 +418,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
            scraped_content.append({
                "title": title,
                "url": url,
-                "content": content
-                "scraper": scraper
            })
            logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
        except requests.exceptions.RequestException as e:

from scrapy.signalmanager import dispatcher
from scrapy.utils.log import configure_logging
from newspaper import Article
+import html2text
from PyPDF2 import PdfReader
+from io import BytesIO
+


# Load environment variables from a .env file

    except ValueError:
        return False

+def is_pdf(url):
+    try:
+        response = requests.head(url, allow_redirects=True)
+        content_type = response.headers.get('Content-Type', '').lower()
+        return 'application/pdf' in content_type
+    except Exception as e:
+        logger.error(f"Error checking content type for {url}: {e}")
+        return False
+
+def scrape_pdf(url):
+    logger.info(f"Scraping PDF: {url}")
+    try:
+        response = requests.get(url)
+        pdf_file = BytesIO(response.content)
+        pdf_reader = PdfReader(pdf_file)
+        text = ""
+        for page in pdf_reader.pages:
+            text += page.extract_text() + "\n"
+        return text.strip()
+    except Exception as e:
+        logger.error(f"Error scraping PDF {url}: {e}")
+        return ""
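A minimal usage sketch (not part of the commit) of how the two new helpers combine; the URL and the fallback call are assumptions chosen for illustration:

    # Hypothetical example: send PDFs to the PyPDF2 path, everything else to Newspaper3k.
    url = "https://example.com/annual-report.pdf"  # placeholder URL, not taken from app.py
    text = scrape_pdf(url) if is_pdf(url) else scrape_with_newspaper(url)
    print(text[:200])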

def scrape_with_newspaper(url):
    logger.info(f"Starting to scrape with Newspaper3k: {url}")
    try:
+        if is_pdf(url):
+            return scrape_pdf(url)
        else:
            article = Article(url)
            article.download()
            article.parse()

        logger.error(f"Error scraping {url} with Newspaper3k: {e}")
        return ""

+def scrape_with_html2text(url):
+    logger.info(f"Starting to scrape with html2text: {url}")
    try:
+        if is_pdf(url):
+            return scrape_pdf(url)
        else:
+            response = requests.get(url)
+            h = html2text.HTML2Text()
+            h.ignore_links = True
+            return h.handle(response.text)
    except Exception as e:
+        logger.error(f"Error scraping {url} with html2text: {e}")
        return ""

def rephrase_query(chat_history, query, temperature=0.2):

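For reference, a standalone sketch (illustrative only, not from app.py) of what html2text produces with the same setting used above; HTML2Text.handle() returns Markdown-style plain text, and ignore_links=True drops hyperlink markup:

    import html2text

    h = html2text.HTML2Text()
    h.ignore_links = True  # same option set in scrape_with_html2text
    sample = "<h1>Title</h1><p>Some <a href='https://example.com'>linked</a> text.</p>"
    print(h.handle(sample))  # roughly: "# Title" followed by "Some linked text."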
        logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
        return "Error: Unable to assess relevance and summarize"

+def scrape_full_content(url, max_chars=3000):
    try:
        logger.info(f"Scraping full content from: {url}")

+        # Try newspaper first
+        content = scrape_with_newspaper(url)

+        # If newspaper fails, try html2text
+        if not content:
+            content = scrape_with_html2text(url)

+        # Limit the content to max_chars
+        return content[:max_chars] if content else ""
    except Exception as e:
        logger.error(f"Error scraping full content from {url}: {e}")
        return ""
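A hypothetical call site (not part of the commit) showing the new fallback chain end to end; the URL and printed messages are placeholders:

    # Tries Newspaper3k first, falls back to html2text, then truncates to max_chars.
    snippet = scrape_full_content("https://example.com/article", max_chars=1000)
    if snippet:
        print(f"Scraped {len(snippet)} characters")
    else:
        print("Both scrapers failed for this URL")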

        logger.error(f"Error in LLM summarization: {e}")
        return "Error: Unable to generate a summary. Please try again."

+def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5):
    try:
        # Step 1: Rephrase the Query

        try:
            logger.info(f"Scraping content from: {url}")

+            content = scrape_full_content(url, max_chars, timeout)

            if not content:
                logger.warning(f"Failed to scrape content from {url}")

            scraped_content.append({
                "title": title,
                "url": url,
+                "content": content
            })
            logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
        except requests.exceptions.RequestException as e: