Shreyas094 committed
Commit 84a4885
1 Parent(s): ef24902

Update app.py

Files changed (1):
  1. app.py +43 -75
app.py CHANGED
@@ -16,8 +16,6 @@ from datetime import datetime
 import os
 from dotenv import load_dotenv
 import certifi
-import random
-from tenacity import retry, stop_after_attempt, wait_exponential
 
 # Load environment variables from a .env file
 load_dotenv()
@@ -68,34 +66,12 @@ def is_valid_url(url):
     except ValueError:
         return False
 
-class ScrapingError(Exception):
-    def __init__(self, message, status_code=None):
-        self.message = message
-        self.status_code = status_code
-        super().__init__(self.message)
-
-def get_random_user_agent(include_searx=False):
-    user_agents = [
-        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
-        # Add more user agents...
-    ]
-
-    searx_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-
-    if include_searx:
-        return searx_agent
-    else:
-        return random.choice(user_agents)
-
-@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
 def scrape_with_bs4(url, session):
     try:
-        headers = {'User-Agent': get_random_user_agent()}
-        response = session.get(url, timeout=15, headers=headers)
+        response = session.get(url, timeout=10)
         response.raise_for_status()
-
         soup = BeautifulSoup(response.content, 'html.parser')
+
         main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
 
         if main_content:
@@ -103,39 +79,19 @@ def scrape_with_bs4(url, session):
         else:
             content = soup.get_text(strip=True)
 
-        return {'success': True, 'content': content}
-    except requests.exceptions.HTTPError as e:
-        if e.response.status_code == 403:
-            logger.warning(f"403 Forbidden error for {url}. Retrying with backoff.")
-            raise ScrapingError("403 Forbidden", status_code=403)
-        logger.error(f"HTTP error scraping {url}: {e}")
-        return {'success': False, 'error': str(e), 'status_code': e.response.status_code}
-    except requests.exceptions.Timeout:
-        logger.error(f"Timeout error scraping {url}")
-        return {'success': False, 'error': 'Timeout'}
-    except requests.exceptions.ConnectionError:
-        logger.error(f"Connection error scraping {url}")
-        return {'success': False, 'error': 'Connection Error'}
+        return content
     except Exception as e:
-        logger.error(f"Unexpected error scraping {url}: {e}")
-        return {'success': False, 'error': str(e)}
+        logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
+        return ""
 
-@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
 def scrape_with_trafilatura(url):
     try:
-        downloaded = fetch_url(url)  # Remove the timeout parameter
-        if downloaded is None:
-            raise ScrapingError("Failed to download content")
+        downloaded = fetch_url(url)
         content = extract(downloaded)
-        if content is None:
-            raise ScrapingError("Failed to extract content")
-        return {'success': True, 'content': content}
-    except ScrapingError as e:
-        logger.error(f"Scraping error for {url}: {e}")
-        return {'success': False, 'error': str(e)}
+        return content or ""
     except Exception as e:
-        logger.error(f"Unexpected error scraping {url} with Trafilatura: {e}")
-        return {'success': False, 'error': str(e)}
+        logger.error(f"Error scraping {url} with Trafilatura: {e}")
+        return ""
 
 def rephrase_query(chat_history, query, temperature=0.2):
     system_prompt = """You are a highly intelligent conversational chatbot. Your task is to analyze the given context and new query, then decide whether to rephrase the query with or without incorporating the context. Follow these steps:
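After this hunk, both scrapers return a plain string ("" on failure) instead of the earlier {'success': ..., 'content': ...} dict, so callers can simply test truthiness. A minimal caller sketch under that contract (the dispatch helper below is illustrative, not part of this commit):

import requests

def scrape(url, scraper="trafilatura"):
    # Hypothetical dispatcher over the two simplified helpers; both now
    # return "" on failure, so a truthiness check replaces dict unpacking.
    if scraper == "bs4":
        with requests.Session() as session:
            content = scrape_with_bs4(url, session)
    else:
        content = scrape_with_trafilatura(url)
    return content or ""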
@@ -296,11 +252,6 @@ def scrape_full_content(url, scraper="trafilatura", max_chars=3000):
         logger.error(f"Error scraping full content from {url}: {e}")
         return ""
 
-
-def rate_limited_scraping(url, scraper_func, *args, **kwargs):
-    time.sleep(random.uniform(1, 3))  # Random delay between 1-3 seconds
-    return scraper_func(url, *args, **kwargs)
-
 def llm_summarize(query, documents, llm_client, temperature=0.2):
     system_prompt = """You are Sentinel, a world class Financial analysis AI model who is expert at searching the web and answering user's queries. You are also an expert at summarizing web pages or documents and searching for content in them."""
 
@@ -378,7 +329,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
 
     # Headers for SearXNG request
     headers = {
-        'User-Agent': get_random_user_agent(include_searx=True),
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
         'Accept': 'application/json, text/javascript, */*; q=0.01',
         'Accept-Language': 'en-US,en;q=0.5',
         'Origin': 'https://shreyas094-searxng-local.hf.space',
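The SearXNG request now pins a single fixed Chrome User-Agent instead of calling the removed get_random_user_agent(include_searx=True). A short sketch of the search call these headers support, assuming the instance's /search endpoint with format=json (endpoint and query parameters are assumptions, not shown in this diff):

import requests

SEARXNG_URL = "https://shreyas094-searxng-local.hf.space/search"  # assumed endpoint

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
}

def searx_search(query, num_results=5):
    # Assumed parameters: SearXNG returns JSON when format=json is requested.
    params = {"q": query, "format": "json"}
    response = requests.get(SEARXNG_URL, params=params, headers=headers, timeout=10)
    response.raise_for_status()
    return response.json().get("results", [])[:num_results]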
@@ -415,7 +366,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
         logger.warning("No results returned from SearXNG.")
         return "No results found for the given query."
 
-    scraped_content = []
+    scraped_content = []
 
     for result in search_results.get('results', [])[:num_results]:
        url = result.get('url', '')
@@ -428,24 +379,41 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
        try:
            logger.info(f"Scraping content from: {url}")
 
-           if scraper == "bs4":
-               content = scrape_with_bs4(url, session)
-           else:  # trafilatura
-               content = scrape_with_trafilatura(url)
+           # Implement a retry mechanism with different user agents
+           user_agents = [
+               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
+               'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+           ]
 
-           # Handle different types of content and limit to max_chars
-           if isinstance(content, dict) and 'content' in content:
-               content['content'] = content['content'][:max_chars]
-           elif isinstance(content, str):
-               content = content[:max_chars]
-           else:
-               logger.warning(f"Unexpected content type for URL: {url}")
-               content = str(content)[:max_chars]
-
+           content = ""
+           for ua in user_agents:
+               try:
+                   if scraper == "bs4":
+                       session.headers.update({'User-Agent': ua})
+                       content = scrape_with_bs4(url, session)
+                   else:  # trafilatura
+                       downloaded = fetch_url(url, headers={'User-Agent': ua})
+                       content = extract(downloaded)
+
+                   if content:
+                       break
+               except requests.exceptions.HTTPError as e:
+                   if e.response.status_code == 403:
+                       logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
+                       continue
+                   else:
+                       raise
+
+           if not content:
+               logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
+               continue
+
+           # Limit content to max_chars
            scraped_content.append({
                "title": title,
                "url": url,
-               "content": content,
+               "content": content[:max_chars],
                "scraper": scraper
            })
        except requests.exceptions.RequestException as e:
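The User-Agent rotation added above is inlined in the scraping loop; the same idea could live in a small helper. A sketch under the same assumptions (fetch_with_ua_rotation is a hypothetical name; note also that whether trafilatura's fetch_url accepts a headers keyword depends on the installed version):

import logging
import requests

logger = logging.getLogger(__name__)

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
]

def fetch_with_ua_rotation(url, session, timeout=10):
    # Hypothetical helper: retry a GET with each User-Agent until one is
    # not rejected with 403 Forbidden; returns None if every agent fails.
    for ua in USER_AGENTS:
        try:
            response = session.get(url, headers={'User-Agent': ua}, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.HTTPError as e:
            if e.response is not None and e.response.status_code == 403:
                logger.warning("403 Forbidden with User-Agent %s; trying next", ua)
                continue
            raise
    return None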
@@ -561,4 +529,4 @@ iface = gr.ChatInterface(
 
 if __name__ == "__main__":
     logger.info("Starting the SearXNG Scraper for Financial News using ChatInterface with Advanced Parameters")
-    iface.launch(share=True)
+    iface.launch(share=True)