Shreyas094 committed
Commit a5594d9
1 Parent(s): 7954811

Update app.py

Files changed (1): app.py (+164, -49)
app.py CHANGED
@@ -27,11 +27,9 @@ from scrapy import signals
 from scrapy.signalmanager import dispatcher
 from scrapy.utils.log import configure_logging
 from newspaper import Article
-import html2text
-from PyPDF2 import PdfReader
-from io import BytesIO
-
-
+import PyPDF2
+import io
+import requests
 
 # Load environment variables from a .env file
 load_dotenv()
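The replaced imports bring in PyPDF2, io, and requests as whole modules, matching how the new scrape_pdf_content below spells out PyPDF2.PdfReader(io.BytesIO(...)). A minimal sketch of that read pattern in isolation; the file name is a placeholder, and the `or ""` guards image-only pages that yield no text:

```python
import io
import PyPDF2

# Mirror the diff's PyPDF2.PdfReader(io.BytesIO(...)) pattern on a local file.
with open("sample.pdf", "rb") as f:  # placeholder path
    reader = PyPDF2.PdfReader(io.BytesIO(f.read()))

text = "\n".join(page.extract_text() or "" for page in reader.pages)
print(f"Extracted {len(text)} characters from {len(reader.pages)} pages")
```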
@@ -82,55 +80,148 @@ def is_valid_url(url):
     except ValueError:
         return False
 
-def is_pdf(url):
-    try:
-        response = requests.head(url, allow_redirects=True)
-        content_type = response.headers.get('Content-Type', '').lower()
-        return 'application/pdf' in content_type
-    except Exception as e:
-        logger.error(f"Error checking content type for {url}: {e}")
-        return False
-
-def scrape_pdf(url):
-    logger.info(f"Scraping PDF: {url}")
+def scrape_pdf_content(url, max_chars=3000, timeout=5):
     try:
-        response = requests.get(url)
-        pdf_file = BytesIO(response.content)
-        pdf_reader = PdfReader(pdf_file)
-        text = ""
+        logger.info(f"Scraping PDF content from: {url}")
+
+        # Download the PDF file
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+
+        # Create a PDF reader object
+        pdf_reader = PyPDF2.PdfReader(io.BytesIO(response.content))
+
+        # Extract text from all pages
+        content = ""
         for page in pdf_reader.pages:
-            text += page.extract_text() + "\n"
-        return text.strip()
+            content += page.extract_text() + "\n"
+
+        # Limit the content to max_chars
+        return content[:max_chars] if content else ""
+    except requests.Timeout:
+        logger.error(f"Timeout error while scraping PDF content from {url}")
+        return ""
     except Exception as e:
-        logger.error(f"Error scraping PDF {url}: {e}")
+        logger.error(f"Error scraping PDF content from {url}: {e}")
         return ""
 
+
+class NewsSpider(scrapy.Spider):
+    name = 'news_spider'
+
+    def __init__(self, url=None, *args, **kwargs):
+        super(NewsSpider, self).__init__(*args, **kwargs)
+        self.start_urls = [url] if url else []
+
+    def parse(self, response):
+        content = ' '.join(response.css('p::text').getall())
+        self.logger.info(f"Scraped content length: {len(content)}")
+        return {'content': content}
+
+def scrape_with_scrapy(url, timeout=30):
+    logger.info(f"Starting to scrape with Scrapy: {url}")
+    configure_logging(install_root_handler=False)
+    logging.getLogger('scrapy').setLevel(logging.WARNING)
+
+    results = []
+
+    def spider_results(signal, sender, item, response, spider):
+        results.append(item)
+
+    process = CrawlerProcess(settings={
+        'LOG_ENABLED': True,
+        'LOG_LEVEL': 'WARNING',
+        'DOWNLOAD_TIMEOUT': timeout
+    })
+
+    dispatcher.connect(spider_results, signal=signals.item_scraped)
+
+    process.crawl(NewsSpider, url=url)
+    process.start()
+
+    # Get the content from results
+    if results:
+        return results[0]['content']
+    return ''
+
 def scrape_with_newspaper(url):
+    if url.lower().endswith('.pdf'):
+        return scrape_pdf_content(url)
+
     logger.info(f"Starting to scrape with Newspaper3k: {url}")
     try:
-        if is_pdf(url):
-            return scrape_pdf(url)
-        else:
-            article = Article(url)
-            article.download()
-            article.parse()
-            return article.text
+        article = Article(url)
+        article.download()
+        article.parse()
+
+        # Combine title and text
+        content = f"Title: {article.title}\n\n"
+        content += article.text
+
+        # Add publish date if available
+        if article.publish_date:
+            content += f"\n\nPublish Date: {article.publish_date}"
+
+        # Add authors if available
+        if article.authors:
+            content += f"\n\nAuthors: {', '.join(article.authors)}"
+
+        # Add top image URL if available
+        if article.top_image:
+            content += f"\n\nTop Image URL: {article.top_image}"
+
+        return content
     except Exception as e:
         logger.error(f"Error scraping {url} with Newspaper3k: {e}")
         return ""
 
-def scrape_with_html2text(url):
-    logger.info(f"Starting to scrape with html2text: {url}")
+def scrape_with_bs4(url, session, max_chars=None):
     try:
-        if is_pdf(url):
-            return scrape_pdf(url)
+        response = session.get(url, timeout=5)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
+
+        if main_content:
+            content = main_content.get_text(strip=True, separator='\n')
         else:
-            response = requests.get(url)
-            h = html2text.HTML2Text()
-            h.ignore_links = True
-            return h.handle(response.text)
+            content = soup.get_text(strip=True, separator='\n')
+
+        return content[:max_chars] if max_chars else content
+    except Exception as e:
+        logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
+        return ""
+
+def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
+    try:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+        downloaded = response.text
+        content = ""
+
+        if use_beautifulsoup:
+            soup = BeautifulSoup(downloaded, "lxml")
+            # Convert BeautifulSoup object to a string
+            html_string = str(soup)
+            # Use Trafilatura's extract function directly on the HTML string
+            content = extract(html_string, include_comments=False, include_tables=True, no_fallback=False)
+
+        # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
+        if not content and use_beautifulsoup:
+            logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
+            content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
+
+        # If still no content, use the URL directly
+        if not content:
+            content = extract(url, include_comments=False, include_tables=True, no_fallback=False)
+
+        return (content or "")[:max_chars] if max_chars else (content or "")
+    except requests.Timeout:
+        logger.error(f"Timeout error while scraping {url} with Trafilatura")
+        return ""
     except Exception as e:
-        logger.error(f"Error scraping {url} with html2text: {e}")
+        logger.error(f"Error scraping {url} with Trafilatura: {e}")
         return ""
 
 def rephrase_query(chat_history, query, temperature=0.2):
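This hunk swaps the single html2text fallback for four independent helpers. A minimal smoke test for them, assuming the module's own imports (requests, BeautifulSoup, trafilatura's extract, logger) are in scope; every URL below is a placeholder:

```python
import requests

# Placeholder URLs, not from the commit.
pdf_text  = scrape_pdf_content("https://example.com/report.pdf", max_chars=500)
news_text = scrape_with_newspaper("https://example.com/story.html")
bs4_text  = scrape_with_bs4("https://example.com/page.html", requests.Session(), max_chars=500)
traf_text = scrape_with_trafilatura("https://example.com/page.html", max_chars=500,
                                    use_beautifulsoup=True)

for name, text in [("pdf", pdf_text), ("newspaper", news_text),
                   ("bs4", bs4_text), ("trafilatura", traf_text)]:
    print(name, len(text))
```

One caveat: scrape_with_scrapy spins up a CrawlerProcess, and the underlying Twisted reactor cannot be restarted within a single Python process, so a second call in the same run will fail; the hunk does not guard against that.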
@@ -266,19 +357,42 @@ Remember to focus on financial aspects and implications in your assessment and s
         logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
         return "Error: Unable to assess relevance and summarize"
 
-def scrape_full_content(url, max_chars=3000):
+def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
     try:
         logger.info(f"Scraping full content from: {url}")
 
-        # Try newspaper first
-        content = scrape_with_newspaper(url)
+        # Check if the URL ends with .pdf
+        if url.lower().endswith('.pdf'):
+            return scrape_pdf_content(url, max_chars, timeout)
 
-        # If newspaper fails, try html2text
-        if not content:
-            content = scrape_with_html2text(url)
+        if scraper == "bs4":
+            session = requests_retry_session()
+            response = session.get(url, timeout=timeout)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Try to find the main content
+            main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
+
+            if main_content:
+                content = main_content.get_text(strip=True, separator='\n')
+            else:
+                content = soup.get_text(strip=True, separator='\n')
+        elif scraper == "trafilatura":
+            content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
+        elif scraper == "scrapy":
+            content = scrape_with_scrapy(url, timeout)
+        elif scraper == "newspaper":
+            content = scrape_with_newspaper(url)
+        else:
+            logger.error(f"Unknown scraper: {scraper}")
+            return ""
 
         # Limit the content to max_chars
         return content[:max_chars] if content else ""
+    except requests.Timeout:
+        logger.error(f"Timeout error while scraping full content from {url}")
+        return ""
     except Exception as e:
         logger.error(f"Error scraping full content from {url}: {e}")
         return ""
@@ -322,7 +436,7 @@ Your response should be detailed, informative, accurate, and directly relevant t
     logger.error(f"Error in LLM summarization: {e}")
     return "Error: Unable to generate a summary. Please try again."
 
-def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
+def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_chars=3000, time_range="", language="all", category="",
                       engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5):
     try:
         # Step 1: Rephrase the Query
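A hedged example of the updated entry point; the query text is invented, and binding the result to a variable assumes a return value that this diff does not show:

```python
# Hypothetical call threading the new scraper choice down to scrape_full_content.
result = search_and_scrape(
    query="Acme Corp Q3 earnings",
    chat_history=[],
    num_results=5,
    scraper="trafilatura",   # new parameter; defaults to "bs4"
    max_chars=3000,
    llm_temperature=0.2,
    timeout=5,
)
```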
@@ -407,9 +521,9 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
                continue
 
            try:
-                logger.info(f"Scraping content from: {url}")
+                logger.info(f"Processing content from: {url}")
 
-                content = scrape_full_content(url, max_chars, timeout)
+                content = scrape_full_content(url, scraper, max_chars, timeout)
 
                if not content:
                    logger.warning(f"Failed to scrape content from {url}")
@@ -418,7 +532,8 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
                scraped_content.append({
                    "title": title,
                    "url": url,
-                    "content": content
+                    "content": content,
+                    "scraper": "pdf" if url.lower().endswith('.pdf') else scraper
                })
                logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
            except requests.exceptions.RequestException as e:
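Each successfully scraped result now records which backend produced it. The shape of one scraped_content entry, with placeholder values:

```python
# Illustrative entry; "scraper" is forced to "pdf" for .pdf URLs.
{
    "title": "Acme Corp announces Q3 results",
    "url": "https://example.com/q3-filing.pdf",
    "content": "Acme Corp reported revenue of ...",  # truncated to max_chars
    "scraper": "pdf",
}
```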
 