Shreyas094 commited on
Commit
d07bea9
1 Parent(s): 07efc76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -8
app.py CHANGED
@@ -7,6 +7,7 @@ from requests.adapters import HTTPAdapter
7
  from requests.packages.urllib3.util.retry import Retry
8
  from trafilatura import fetch_url, extract
9
  from trafilatura import extract
 
10
  from trafilatura.settings import use_config
11
  from urllib.request import urlopen, Request
12
  import json
@@ -87,11 +88,16 @@ def scrape_with_bs4(url, session, max_chars=None):
87
  logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
88
  return ""
89
 
90
- def scrape_with_trafilatura(url, max_chars=None):
91
  try:
92
- downloaded = fetch_url(url)
 
 
93
  content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
94
  return (content or "")[:max_chars] if max_chars else (content or "")
 
 
 
95
  except Exception as e:
96
  logger.error(f"Error scraping {url} with Trafilatura: {e}")
97
  return ""
@@ -228,13 +234,13 @@ Remember to focus on financial aspects and implications in your assessment and s
228
  logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
229
  return "Error: Unable to assess relevance and summarize"
230
 
231
- def scrape_full_content(url, scraper="trafilatura", max_chars=3000):
232
  try:
233
  logger.info(f"Scraping full content from: {url}")
234
 
235
  if scraper == "bs4":
236
  session = requests_retry_session()
237
- response = session.get(url, timeout=10)
238
  response.raise_for_status()
239
  soup = BeautifulSoup(response.content, 'html.parser')
240
 
@@ -246,11 +252,13 @@ def scrape_full_content(url, scraper="trafilatura", max_chars=3000):
246
  else:
247
  content = soup.get_text(strip=True, separator='\n')
248
  else: # trafilatura
249
- downloaded = fetch_url(url)
250
- content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
251
 
252
  # Limit the content to max_chars
253
  return content[:max_chars] if content else ""
 
 
 
254
  except Exception as e:
255
  logger.error(f"Error scraping full content from {url}: {e}")
256
  return ""
@@ -298,7 +306,7 @@ from trafilatura.settings import use_config
298
  from urllib.request import urlopen, Request
299
 
300
  def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura", max_chars=3000, time_range="", language="all", category="",
301
- engines=[], safesearch=2, method="GET", llm_temperature=0.2):
302
  try:
303
  # Step 1: Rephrase the Query
304
  rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
@@ -407,7 +415,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
407
  config = use_config()
408
  config.set("DEFAULT", "USER_AGENT", ua)
409
 
410
- content = scrape_with_trafilatura(url, max_chars)
411
 
412
  if content:
413
  break
 
7
  from requests.packages.urllib3.util.retry import Retry
8
  from trafilatura import fetch_url, extract
9
  from trafilatura import extract
10
+ from requests.exceptions import Timeout
11
  from trafilatura.settings import use_config
12
  from urllib.request import urlopen, Request
13
  import json
 
88
  logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
89
  return ""
90
 
91
def scrape_with_trafilatura(url, max_chars=None, timeout=10):
    """Fetch *url* over HTTP and extract its main text with trafilatura.

    Args:
        url: Page URL to scrape.
        max_chars: Optional cap on the number of characters returned;
            ``None`` (or 0) returns the full extracted text.
        timeout: Per-request timeout in seconds.

    Returns:
        The extracted text (possibly truncated to ``max_chars``), or ""
        on timeout, HTTP error, or extraction failure.
    """
    try:
        # Use the shared retry-enabled session for consistency with the
        # bs4 path (see scrape_with_bs4 / scrape_full_content) instead of
        # a one-shot requests.get with no retry policy.
        session = requests_retry_session()
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        downloaded = response.text
        content = extract(
            downloaded,
            include_comments=False,
            include_tables=True,
            no_fallback=False,
        )
        # extract() returns None when it cannot pull any text out.
        content = content or ""
        return content[:max_chars] if max_chars else content
    except Timeout:
        logger.error(f"Timeout error while scraping {url} with Trafilatura")
        return ""
    except Exception as e:
        logger.error(f"Error scraping {url} with Trafilatura: {e}")
        return ""
 
234
  logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
235
  return "Error: Unable to assess relevance and summarize"
236
 
237
+ def scrape_full_content(url, scraper="trafilatura", max_chars=3000, timeout=10):
238
  try:
239
  logger.info(f"Scraping full content from: {url}")
240
 
241
  if scraper == "bs4":
242
  session = requests_retry_session()
243
+ response = session.get(url, timeout=timeout)
244
  response.raise_for_status()
245
  soup = BeautifulSoup(response.content, 'html.parser')
246
 
 
252
  else:
253
  content = soup.get_text(strip=True, separator='\n')
254
  else: # trafilatura
255
+ content = scrape_with_trafilatura(url, max_chars, timeout)
 
256
 
257
  # Limit the content to max_chars
258
  return content[:max_chars] if content else ""
259
+ except Timeout:
260
+ logger.error(f"Timeout error while scraping full content from {url}")
261
+ return ""
262
  except Exception as e:
263
  logger.error(f"Error scraping full content from {url}: {e}")
264
  return ""
 
306
  from urllib.request import urlopen, Request
307
 
308
  def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura", max_chars=3000, time_range="", language="all", category="",
309
+ engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=10):
310
  try:
311
  # Step 1: Rephrase the Query
312
  rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
 
415
  config = use_config()
416
  config.set("DEFAULT", "USER_AGENT", ua)
417
 
418
+ content = scrape_with_trafilatura(url, max_chars, timeout=timeout)
419
 
420
  if content:
421
  break