Shreyas094 committed
Commit c6a0be6
1 Parent(s): 6552a74

Update app.py

Files changed (1)
  1. app.py +72 -30
app.py CHANGED
@@ -142,35 +142,50 @@ def scrape_with_bs4(url, session, max_chars=None):
         logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
         return ""
 
-def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
+def scrape_with_trafilatura(url, max_chars=None, timeout=5):
+    """
+    Scrape web content using Trafilatura with simplified error handling and fallback options.
+
+    Args:
+        url (str): The URL to scrape
+        max_chars (int, optional): Maximum number of characters to return
+        timeout (int, optional): Request timeout in seconds
+
+    Returns:
+        str: Extracted content or empty string if extraction fails
+    """
     try:
+        # Make the request with timeout
        response = requests.get(url, timeout=timeout)
         response.raise_for_status()
-        downloaded = response.text
-        content = ""
-
-        if use_beautifulsoup:
-            soup = BeautifulSoup(downloaded, "lxml")
-            # Convert BeautifulSoup object to a string
-            html_string = str(soup)
-            # Use Trafilatura's extract function directly on the HTML string
-            content = extract(html_string, include_comments=False, include_tables=True, no_fallback=False)
 
-        # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
-        if not content and use_beautifulsoup:
-            logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
-            content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
-
-        # If still no content, use the URL directly
+        # Extract content from the downloaded HTML
+        content = extract(
+            response.text,
+            include_comments=False,
+            include_tables=True,
+            no_fallback=False
+        )
+
+        # If first attempt fails, try direct URL extraction
         if not content:
-            content = extract(url, include_comments=False, include_tables=True, no_fallback=False)
-
-        return (content or "")[:max_chars] if max_chars else (content or "")
+            content = extract(
+                url,
+                include_comments=False,
+                include_tables=True,
+                no_fallback=False
+            )
+
+        # Return content with optional length limit
+        if content and max_chars:
+            return content[:max_chars]
+        return content or ""
+
     except requests.Timeout:
-        logger.error(f"Timeout error while scraping {url} with Trafilatura")
+        logger.error(f"Timeout error while scraping {url}")
         return ""
     except Exception as e:
-        logger.error(f"Error scraping {url} with Trafilatura: {e}")
+        logger.error(f"Error scraping {url}: {str(e)}")
         return ""
 
 def rephrase_query(chat_history, query, temperature=0.2):
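One caveat on the fallback in the hunk above: trafilatura's extract() treats its first argument as HTML to parse, so the second attempt, which passes the URL string directly, will likely return nothing rather than fetch the page. A minimal sketch of a fetch-based fallback using trafilatura's own fetch_url downloader (the wrapper name below is hypothetical; extract and fetch_url are real trafilatura functions, and the extraction settings mirror the commit's):

    from trafilatura import extract, fetch_url

    def trafilatura_url_fallback(url, max_chars=None):
        # Download the page with trafilatura's bundled fetcher
        downloaded = fetch_url(url)
        if not downloaded:
            return ""
        # Same extraction settings as the commit uses
        content = extract(
            downloaded,
            include_comments=False,
            include_tables=True,
            no_fallback=False
        )
        if not content:
            return ""
        return content[:max_chars] if max_chars else content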
@@ -307,9 +322,23 @@ Remember to focus on financial aspects and implications in your assessment and s
         return "Error: Unable to assess relevance and summarize"
 
 def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
+    """
+    Unified content scraper that supports multiple scraping methods.
+
+    Args:
+        url (str): The URL to scrape
+        scraper (str): Scraping method to use ('bs4', 'trafilatura', 'scrapy', 'newspaper')
+        max_chars (int): Maximum number of characters to return
+        timeout (int): Request timeout in seconds
+
+    Returns:
+        str: Scraped content or empty string if scraping fails
+    """
     try:
         logger.info(f"Scraping full content from: {url}")
 
+        content = ""
+
         if scraper == "bs4":
             session = requests_retry_session()
             response = session.get(url, timeout=timeout)
@@ -317,26 +346,39 @@ def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
             soup = BeautifulSoup(response.content, 'html.parser')
 
             # Try to find the main content
-            main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
+            main_content = (
+                soup.find('main') or
+                soup.find('article') or
+                soup.find('div', class_='content')
+            )
+
+            content = main_content.get_text(strip=True, separator='\n') if main_content else soup.get_text(strip=True, separator='\n')
 
-            if main_content:
-                content = main_content.get_text(strip=True, separator='\n')
-            else:
-                content = soup.get_text(strip=True, separator='\n')
         elif scraper == "trafilatura":
-            content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
+            content = scrape_with_trafilatura(url, max_chars, timeout)
+
         elif scraper == "scrapy":
             content = scrape_with_scrapy(url, timeout)
+
         elif scraper == "newspaper":
-            content = scrape_with_newspaper(url)
+            article = Article(url)
+            article.download()
+            article.parse()
+            content = article.text
+
         else:
             logger.error(f"Unknown scraper: {scraper}")
             return ""
 
-        # Limit the content to max_chars
-        return content[:max_chars] if content else ""
-    except Timeout:
-        logger.error(f"Timeout error while scraping full content from {url}")
+        # Standardize whitespace and limit content length
+        if content:
+            content = " ".join(content.split())  # Standardize whitespace
+            return content[:max_chars] if max_chars else content
+
+        return ""
+
+    except requests.Timeout:
+        logger.error(f"Timeout error while scraping {url}")
         return ""
     except Exception as e:
         logger.error(f"Error scraping full content from {url}: {e}")