Shreyas094 committed
Commit b577b65
Parent: f57b788

Update app.py

Files changed (1)
  1. app.py +47 -172
app.py CHANGED
@@ -27,10 +27,10 @@ from scrapy import signals
 from scrapy.signalmanager import dispatcher
 from scrapy.utils.log import configure_logging
 from newspaper import Article
-from io import BytesIO
+import html2text
 from PyPDF2 import PdfReader
-import logging
-import tempfile
+from io import BytesIO
+


 # Load environment variables from a .env file
@@ -82,62 +82,35 @@ def is_valid_url(url):
     except ValueError:
         return False

-class NewsSpider(scrapy.Spider):
-    name = 'news_spider'
-
-    def __init__(self, url=None, *args, **kwargs):
-        super(NewsSpider, self).__init__(*args, **kwargs)
-        self.start_urls = [url] if url else []
-
-    def parse(self, response):
-        content = ' '.join(response.css('p::text').getall())
-        self.logger.info(f"Scraped content length: {len(content)}")
-        return {'content': content}
-
-def scrape_with_scrapy(url, timeout=30):
-    logger.info(f"Starting to scrape with Scrapy: {url}")
-    configure_logging(install_root_handler=False)
-    logging.getLogger('scrapy').setLevel(logging.WARNING)
-
-    results = []
-
-    def spider_results(signal, sender, item, response, spider):
-        results.append(item)
-
-    process = CrawlerProcess(settings={
-        'LOG_ENABLED': True,
-        'LOG_LEVEL': 'WARNING',
-        'DOWNLOAD_TIMEOUT': timeout
-    })
-
-    dispatcher.connect(spider_results, signal=signals.item_scraped)
-
-    process.crawl(NewsSpider, url=url)
-    process.start()
-
-    # Get the content from results
-    if results:
-        return results[0]['content']
-    return ''
+def is_pdf(url):
+    try:
+        response = requests.head(url, allow_redirects=True)
+        content_type = response.headers.get('Content-Type', '').lower()
+        return 'application/pdf' in content_type
+    except Exception as e:
+        logger.error(f"Error checking content type for {url}: {e}")
+        return False
+
+def scrape_pdf(url):
+    logger.info(f"Scraping PDF: {url}")
+    try:
+        response = requests.get(url)
+        pdf_file = BytesIO(response.content)
+        pdf_reader = PdfReader(pdf_file)
+        text = ""
+        for page in pdf_reader.pages:
+            text += page.extract_text() + "\n"
+        return text.strip()
+    except Exception as e:
+        logger.error(f"Error scraping PDF {url}: {e}")
+        return ""

 def scrape_with_newspaper(url):
     logger.info(f"Starting to scrape with Newspaper3k: {url}")
     try:
-        # Check if the URL is a PDF
-        response = requests.get(url)
-        content_type = response.headers.get('Content-Type', '').lower()
-
-        if 'application/pdf' in content_type:
-            # Handle PDF
-            logger.info(f"Detected PDF file: {url}")
-            pdf_file = BytesIO(response.content)
-            pdf_reader = PdfReader(pdf_file)
-            text = ""
-            for page in pdf_reader.pages:
-                text += page.extract_text() + "\n"
-            return text.strip()
+        if is_pdf(url):
+            return scrape_pdf(url)
         else:
-            # Handle regular web page
             article = Article(url)
             article.download()
             article.parse()
@@ -146,68 +119,18 @@ def scrape_with_newspaper(url):
         logger.error(f"Error scraping {url} with Newspaper3k: {e}")
         return ""

-def scrape_with_bs4(url, session, max_chars=None):
+def scrape_with_html2text(url):
+    logger.info(f"Starting to scrape with html2text: {url}")
     try:
-        response = session.get(url, timeout=5)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
-
-        if main_content:
-            content = main_content.get_text(strip=True, separator='\n')
+        if is_pdf(url):
+            return scrape_pdf(url)
         else:
-            content = soup.get_text(strip=True, separator='\n')
-
-        return content[:max_chars] if max_chars else content
-    except Exception as e:
-        logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
-        return ""
-
-def scrape_with_trafilatura(url, max_chars=None, timeout=5):
-    """
-    Scrape web content using Trafilatura with simplified error handling and fallback options.
-
-    Args:
-        url (str): The URL to scrape
-        max_chars (int, optional): Maximum number of characters to return
-        timeout (int, optional): Request timeout in seconds
-
-    Returns:
-        str: Extracted content or empty string if extraction fails
-    """
-    try:
-        # Make the request with timeout
-        response = requests.get(url, timeout=timeout)
-        response.raise_for_status()
-
-        # Extract content from the downloaded HTML
-        content = extract(
-            response.text,
-            include_comments=False,
-            include_tables=True,
-            no_fallback=False
-        )
-
-        # If first attempt fails, try direct URL extraction
-        if not content:
-            content = extract(
-                url,
-                include_comments=False,
-                include_tables=True,
-                no_fallback=False
-            )
-
-        # Return content with optional length limit
-        if content and max_chars:
-            return content[:max_chars]
-        return content or ""
-
-    except requests.Timeout:
-        logger.error(f"Timeout error while scraping {url}")
-        return ""
+            response = requests.get(url)
+            h = html2text.HTML2Text()
+            h.ignore_links = True
+            return h.handle(response.text)
     except Exception as e:
-        logger.error(f"Error scraping {url}: {str(e)}")
+        logger.error(f"Error scraping {url} with html2text: {e}")
         return ""

 def rephrase_query(chat_history, query, temperature=0.2):
@@ -343,65 +266,19 @@ Remember to focus on financial aspects and implications in your assessment and s
         logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
         return "Error: Unable to assess relevance and summarize"

-def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
-    """
-    Unified content scraper that supports multiple scraping methods.
-
-    Args:
-        url (str): The URL to scrape
-        scraper (str): Scraping method to use ('bs4', 'trafilatura', 'scrapy', 'newspaper')
-        max_chars (int): Maximum number of characters to return
-        timeout (int): Request timeout in seconds
-
-    Returns:
-        str: Scraped content or empty string if scraping fails
-    """
+def scrape_full_content(url, max_chars=3000):
     try:
         logger.info(f"Scraping full content from: {url}")

-        content = ""
+        # Try newspaper first
+        content = scrape_with_newspaper(url)

-        if scraper == "bs4":
-            session = requests_retry_session()
-            response = session.get(url, timeout=timeout)
-            response.raise_for_status()
-            soup = BeautifulSoup(response.content, 'html.parser')
-
-            # Try to find the main content
-            main_content = (
-                soup.find('main') or
-                soup.find('article') or
-                soup.find('div', class_='content')
-            )
-
-            content = main_content.get_text(strip=True, separator='\n') if main_content else soup.get_text(strip=True, separator='\n')
-
-        elif scraper == "trafilatura":
-            content = scrape_with_trafilatura(url, max_chars, timeout)
-
-        elif scraper == "scrapy":
-            content = scrape_with_scrapy(url, timeout)
-
-        elif scraper == "newspaper":
-            article = Article(url)
-            article.download()
-            article.parse()
-            content = article.text
-
-        else:
-            logger.error(f"Unknown scraper: {scraper}")
-            return ""
-
-        # Standardize whitespace and limit content length
-        if content:
-            content = " ".join(content.split()) # Standardize whitespace
-            return content[:max_chars] if max_chars else content
-
-        return ""
+        # If newspaper fails, try html2text
+        if not content:
+            content = scrape_with_html2text(url)

-    except requests.Timeout:
-        logger.error(f"Timeout error while scraping {url}")
-        return ""
+        # Limit the content to max_chars
+        return content[:max_chars] if content else ""
     except Exception as e:
         logger.error(f"Error scraping full content from {url}: {e}")
         return ""
@@ -445,7 +322,7 @@ Your response should be detailed, informative, accurate, and directly relevant t
         logger.error(f"Error in LLM summarization: {e}")
         return "Error: Unable to generate a summary. Please try again."

-def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_chars=3000, time_range="", language="all", category="",
+def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
                       engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5):
     try:
         # Step 1: Rephrase the Query
@@ -532,8 +409,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
             try:
                 logger.info(f"Scraping content from: {url}")

-                # MODIFY: Remove the user agent loop and use a single scraping method
-                content = scrape_full_content(url, scraper, max_chars, timeout)
+                content = scrape_full_content(url, max_chars, timeout)

                 if not content:
                     logger.warning(f"Failed to scrape content from {url}")
@@ -542,8 +418,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
                 scraped_content.append({
                     "title": title,
                     "url": url,
-                    "content": content,
-                    "scraper": scraper
+                    "content": content
                 })
                 logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
             except requests.exceptions.RequestException as e:
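
For reference, here is a minimal, self-contained sketch of the scraping flow app.py converges on after this commit: a HEAD request decides whether a URL points to a PDF, PyPDF2 extracts the PDF text, and HTML pages go through Newspaper3k with html2text as the fallback. The function names mirror the diff, but the snippet is illustrative rather than a copy of app.py: it defines its own logger, adds request timeouts, and calls scrape_full_content with only url and max_chars, since the new signature no longer accepts the timeout argument that the updated call site in search_and_scrape still passes.

import logging
from io import BytesIO

import html2text
import requests
from newspaper import Article
from PyPDF2 import PdfReader

logger = logging.getLogger(__name__)

def is_pdf(url):
    # A HEAD request is usually enough to read the Content-Type header.
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return 'application/pdf' in response.headers.get('Content-Type', '').lower()
    except Exception as e:
        logger.error(f"Error checking content type for {url}: {e}")
        return False

def scrape_pdf(url):
    # Download the PDF into memory and concatenate the text of every page.
    try:
        response = requests.get(url, timeout=30)
        reader = PdfReader(BytesIO(response.content))
        return "\n".join(page.extract_text() or "" for page in reader.pages).strip()
    except Exception as e:
        logger.error(f"Error scraping PDF {url}: {e}")
        return ""

def scrape_with_newspaper(url):
    # PDFs bypass Newspaper3k entirely.
    if is_pdf(url):
        return scrape_pdf(url)
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        logger.error(f"Error scraping {url} with Newspaper3k: {e}")
        return ""

def scrape_with_html2text(url):
    # Fallback path: raw HTML converted to plain text, links dropped.
    if is_pdf(url):
        return scrape_pdf(url)
    try:
        response = requests.get(url, timeout=5)
        h = html2text.HTML2Text()
        h.ignore_links = True
        return h.handle(response.text)
    except Exception as e:
        logger.error(f"Error scraping {url} with html2text: {e}")
        return ""

def scrape_full_content(url, max_chars=3000):
    # Newspaper3k first, html2text as the fallback, then truncate.
    content = scrape_with_newspaper(url) or scrape_with_html2text(url)
    return content[:max_chars] if content else ""

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    print(scrape_full_content("https://example.com")[:500])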