Shreyas094 committed
Commit a5594d9
1 Parent(s): 7954811
Update app.py
app.py CHANGED
@@ -27,11 +27,9 @@ from scrapy import signals
 from scrapy.signalmanager import dispatcher
 from scrapy.utils.log import configure_logging
 from newspaper import Article
-import
-
-
-
-
+import PyPDF2
+import io
+import requests

 # Load environment variables from a .env file
 load_dotenv()
@@ -82,55 +80,148 @@ def is_valid_url(url):
     except ValueError:
         return False

-def
-    try:
-        response = requests.head(url, allow_redirects=True)
-        content_type = response.headers.get('Content-Type', '').lower()
-        return 'application/pdf' in content_type
-    except Exception as e:
-        logger.error(f"Error checking content type for {url}: {e}")
-        return False
-
-def scrape_pdf(url):
-    logger.info(f"Scraping PDF: {url}")
+def scrape_pdf_content(url, max_chars=3000, timeout=5):
     try:
-
-
-
-
+        logger.info(f"Scraping PDF content from: {url}")
+
+        # Download the PDF file
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+
+        # Create a PDF reader object
+        pdf_reader = PyPDF2.PdfReader(io.BytesIO(response.content))
+
+        # Extract text from all pages
+        content = ""
         for page in pdf_reader.pages:
-
-
+            content += page.extract_text() + "\n"
+
+        # Limit the content to max_chars
+        return content[:max_chars] if content else ""
+    except requests.Timeout:
+        logger.error(f"Timeout error while scraping PDF content from {url}")
+        return ""
     except Exception as e:
-        logger.error(f"Error scraping PDF {url}: {e}")
+        logger.error(f"Error scraping PDF content from {url}: {e}")
         return ""

+
+class NewsSpider(scrapy.Spider):
+    name = 'news_spider'
+
+    def __init__(self, url=None, *args, **kwargs):
+        super(NewsSpider, self).__init__(*args, **kwargs)
+        self.start_urls = [url] if url else []
+
+    def parse(self, response):
+        content = ' '.join(response.css('p::text').getall())
+        self.logger.info(f"Scraped content length: {len(content)}")
+        return {'content': content}
+
+def scrape_with_scrapy(url, timeout=30):
+    logger.info(f"Starting to scrape with Scrapy: {url}")
+    configure_logging(install_root_handler=False)
+    logging.getLogger('scrapy').setLevel(logging.WARNING)
+
+    results = []
+
+    def spider_results(signal, sender, item, response, spider):
+        results.append(item)
+
+    process = CrawlerProcess(settings={
+        'LOG_ENABLED': True,
+        'LOG_LEVEL': 'WARNING',
+        'DOWNLOAD_TIMEOUT': timeout
+    })
+
+
+    dispatcher.connect(spider_results, signal=signals.item_scraped)
+
+    process.crawl(NewsSpider, url=url)
+    process.start()
+
+    # Get the content from results
+    if results:
+        return results[0]['content']
+    return ''
 def scrape_with_newspaper(url):
+    if url.lower().endswith('.pdf'):
+        return scrape_pdf_content(url)
+
     logger.info(f"Starting to scrape with Newspaper3k: {url}")
     try:
-
-
-
-
-
-
-
+        article = Article(url)
+        article.download()
+        article.parse()
+
+        # Combine title and text
+        content = f"Title: {article.title}\n\n"
+        content += article.text
+
+        # Add publish date if available
+        if article.publish_date:
+            content += f"\n\nPublish Date: {article.publish_date}"
+
+        # Add authors if available
+        if article.authors:
+            content += f"\n\nAuthors: {', '.join(article.authors)}"
+
+        # Add top image URL if available
+        if article.top_image:
+            content += f"\n\nTop Image URL: {article.top_image}"
+
+        return content
     except Exception as e:
         logger.error(f"Error scraping {url} with Newspaper3k: {e}")
         return ""

-def
-    logger.info(f"Starting to scrape with html2text: {url}")
+def scrape_with_bs4(url, session, max_chars=None):
     try:
-
-
+        response = session.get(url, timeout=5)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
+
+        if main_content:
+            content = main_content.get_text(strip=True, separator='\n')
         else:
-
-
-
-
+            content = soup.get_text(strip=True, separator='\n')
+
+        return content[:max_chars] if max_chars else content
+    except Exception as e:
+        logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
+        return ""
+
+def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
+    try:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+        downloaded = response.text
+        content = ""
+
+        if use_beautifulsoup:
+            soup = BeautifulSoup(downloaded, "lxml")
+            # Convert BeautifulSoup object to a string
+            html_string = str(soup)
+            # Use Trafilatura's extract function directly on the HTML string
+            content = extract(html_string, include_comments=False, include_tables=True, no_fallback=False)
+
+        # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
+        if not content and use_beautifulsoup:
+            logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
+            content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
+
+        # If still no content, use the URL directly
+        if not content:
+            content = extract(url, include_comments=False, include_tables=True, no_fallback=False)
+
+        return (content or "")[:max_chars] if max_chars else (content or "")
+    except requests.Timeout:
+        logger.error(f"Timeout error while scraping {url} with Trafilatura")
+        return ""
     except Exception as e:
-        logger.error(f"Error scraping {url} with
+        logger.error(f"Error scraping {url} with Trafilatura: {e}")
         return ""

 def rephrase_query(chat_history, query, temperature=0.2):
@@ -266,19 +357,42 @@ Remember to focus on financial aspects and implications in your assessment and s
         logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
         return "Error: Unable to assess relevance and summarize"

-def scrape_full_content(url, max_chars=3000):
+def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
     try:
         logger.info(f"Scraping full content from: {url}")

-        #
-
+        # Check if the URL ends with .pdf
+        if url.lower().endswith('.pdf'):
+            return scrape_pdf_content(url, max_chars, timeout)

-
-
-
+        if scraper == "bs4":
+            session = requests_retry_session()
+            response = session.get(url, timeout=timeout)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Try to find the main content
+            main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
+
+            if main_content:
+                content = main_content.get_text(strip=True, separator='\n')
+            else:
+                content = soup.get_text(strip=True, separator='\n')
+        elif scraper == "trafilatura":
+            content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
+        elif scraper == "scrapy":
+            content = scrape_with_scrapy(url, timeout)
+        elif scraper == "newspaper":
+            content = scrape_with_newspaper(url)
+        else:
+            logger.error(f"Unknown scraper: {scraper}")
+            return ""

         # Limit the content to max_chars
         return content[:max_chars] if content else ""
+    except requests.Timeout:
+        logger.error(f"Timeout error while scraping full content from {url}")
+        return ""
     except Exception as e:
         logger.error(f"Error scraping full content from {url}: {e}")
         return ""
@@ -322,7 +436,7 @@ Your response should be detailed, informative, accurate, and directly relevant t
         logger.error(f"Error in LLM summarization: {e}")
         return "Error: Unable to generate a summary. Please try again."

-def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
+def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_chars=3000, time_range="", language="all", category="",
                       engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5):
     try:
         # Step 1: Rephrase the Query
@@ -407,9 +521,9 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
                 continue

             try:
-                logger.info(f"
+                logger.info(f"Processing content from: {url}")

-                content = scrape_full_content(url, max_chars, timeout)
+                content = scrape_full_content(url, scraper, max_chars, timeout)

                 if not content:
                     logger.warning(f"Failed to scrape content from {url}")
@@ -418,7 +532,8 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
                 scraped_content.append({
                     "title": title,
                     "url": url,
-                    "content": content
+                    "content": content,
+                    "scraper": "pdf" if url.lower().endswith('.pdf') else scraper
                 })
                 logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
             except requests.exceptions.RequestException as e:
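For context on how the scraper argument added in this commit is meant to be exercised, here is a minimal usage sketch. It is not part of the commit: it assumes the patched app.py above is importable from the current directory with its dependencies installed, and the URLs are placeholders.

# Minimal usage sketch (illustrative, not part of the commit): assumes app.py,
# as patched above, is importable and its dependencies (requests, BeautifulSoup,
# trafilatura, PyPDF2, Scrapy, newspaper3k) are installed.
from app import scrape_full_content

if __name__ == "__main__":
    # Placeholder URLs for illustration only.
    html_url = "https://example.com/some-article"
    pdf_url = "https://example.com/some-report.pdf"

    # Select one of the scraper backends by name
    # ("bs4", "trafilatura", "scrapy", or "newspaper").
    text = scrape_full_content(html_url, scraper="trafilatura", max_chars=3000, timeout=5)
    print(f"HTML content length: {len(text)}")

    # URLs ending in .pdf are routed to scrape_pdf_content regardless of the
    # scraper argument, mirroring the check at the top of scrape_full_content.
    pdf_text = scrape_full_content(pdf_url, scraper="bs4", max_chars=3000, timeout=5)
    print(f"PDF content length: {len(pdf_text)}")

As the diff shows, search_and_scrape forwards the same scraper value when it calls scrape_full_content, so the backend chosen at the top level applies to every result it scrapes.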