Shreyas094 committed on
Commit 3817f14
1 Parent(s): c17888a

Update app.py

Files changed (1):
  1. app.py +58 -10
app.py CHANGED
@@ -16,6 +16,7 @@ from datetime import datetime
 import os
 from dotenv import load_dotenv
 import certifi
+import random
 
 # Load environment variables from a .env file
 load_dotenv()
@@ -66,12 +67,34 @@ def is_valid_url(url):
     except ValueError:
         return False
 
+class ScrapingError(Exception):
+    def __init__(self, message, status_code=None):
+        self.message = message
+        self.status_code = status_code
+        super().__init__(self.message)
+
+def get_random_user_agent(include_searx=False):
+    user_agents = [
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
+        # Add more user agents...
+    ]
+
+    searx_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+
+    if include_searx:
+        return searx_agent
+    else:
+        return random.choice(user_agents)
+
+@retry(stop=stop_after_attempt(1), wait=wait_exponential(multiplier=1, min=4, max=10))
 def scrape_with_bs4(url, session):
     try:
-        response = session.get(url, timeout=10)
+        headers = {'User-Agent': get_random_user_agent()}
+        response = session.get(url, timeout=15, headers=headers)
         response.raise_for_status()
-        soup = BeautifulSoup(response.content, 'html.parser')
 
+        soup = BeautifulSoup(response.content, 'html.parser')
         main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
 
         if main_content:
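A quick sketch of what the rotation does in practice (the URLs below are illustrative; get_random_user_agent is the helper added above):

import requests

session = requests.Session()
# Each call draws a fresh User-Agent from the pool, so consecutive
# fetches from the same session present varying browser signatures.
for url in ('https://example.com/a', 'https://example.com/b'):
    headers = {'User-Agent': get_random_user_agent()}
    response = session.get(url, timeout=15, headers=headers)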
@@ -79,19 +102,39 @@ def scrape_with_bs4(url, session):
         else:
             content = soup.get_text(strip=True)
 
-        return content
+        return {'success': True, 'content': content}
+    except requests.exceptions.HTTPError as e:
+        if e.response.status_code == 403:
+            logger.warning(f"403 Forbidden error for {url}. Retrying with backoff.")
+            raise ScrapingError("403 Forbidden", status_code=403)
+        logger.error(f"HTTP error scraping {url}: {e}")
+        return {'success': False, 'error': str(e), 'status_code': e.response.status_code}
+    except requests.exceptions.Timeout:
+        logger.error(f"Timeout error scraping {url}")
+        return {'success': False, 'error': 'Timeout'}
+    except requests.exceptions.ConnectionError:
+        logger.error(f"Connection error scraping {url}")
+        return {'success': False, 'error': 'Connection Error'}
     except Exception as e:
-        logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
-        return ""
+        logger.error(f"Unexpected error scraping {url}: {e}")
+        return {'success': False, 'error': str(e)}
 
+@retry(stop=stop_after_attempt(1), wait=wait_exponential(multiplier=1, min=4, max=10))
 def scrape_with_trafilatura(url):
     try:
-        downloaded = fetch_url(url)
+        downloaded = fetch_url(url, timeout=10)
+        if downloaded is None:
+            raise ScrapingError("Failed to download content")
         content = extract(downloaded)
-        return content or ""
+        if content is None:
+            raise ScrapingError("Failed to extract content")
+        return {'success': True, 'content': content}
+    except ScrapingError as e:
+        logger.error(f"Scraping error for {url}: {e}")
+        return {'success': False, 'error': str(e)}
     except Exception as e:
-        logger.error(f"Error scraping {url} with Trafilatura: {e}")
-        return ""
+        logger.error(f"Unexpected error scraping {url} with Trafilatura: {e}")
+        return {'success': False, 'error': str(e)}
 
 def rephrase_query(chat_history, query, temperature=0.2):
     system_prompt = """You are a highly intelligent conversational chatbot. Your task is to analyze the given context and new query, then decide whether to rephrase the query with or without incorporating the context. Follow these steps:
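Both scrapers now return a result dict instead of a bare string, so callers can tell an empty page apart from a failure. One caveat worth noting: stop_after_attempt(1) gives tenacity exactly one attempt, so the ScrapingError raised on a 403 surfaces as a tenacity RetryError rather than triggering the exponential backoff. A minimal caller sketch that handles both shapes (safe_scrape is hypothetical, not part of the commit):

import requests
from tenacity import RetryError

def safe_scrape(url, session):
    # Soft failures come back as {'success': False, ...}; the 403 path
    # re-raises ScrapingError, which tenacity wraps in RetryError once
    # the single attempt is spent.
    try:
        return scrape_with_bs4(url, session)
    except RetryError as e:
        return {'success': False, 'error': str(e.last_attempt.exception())}

result = safe_scrape('https://example.com/report', requests.Session())
content = result['content'] if result['success'] else ''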
@@ -252,6 +295,11 @@ def scrape_full_content(url, scraper="trafilatura", max_chars=3000):
         logger.error(f"Error scraping full content from {url}: {e}")
         return ""
 
+
+def rate_limited_scraping(url, scraper_func, *args, **kwargs):
+    time.sleep(random.uniform(1, 3))  # Random delay between 1-3 seconds
+    return scraper_func(url, *args, **kwargs)
+
 def llm_summarize(query, documents, llm_client, temperature=0.2):
     system_prompt = """You are Sentinel, a world class Financial analysis AI model who is expert at searching the web and answering user's queries. You are also an expert at summarizing web pages or documents and searching for content in them."""
 
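A usage sketch for the new throttle (urls_to_fetch and process are placeholders; time is assumed to be imported elsewhere in app.py, since this commit only adds random):

# Pace a batch of fetches: each call sleeps 1-3 s before delegating
# to the scraper, so requests are spread out rather than fired in a burst.
for url in urls_to_fetch:
    result = rate_limited_scraping(url, scrape_with_trafilatura)
    if result['success']:
        process(result['content'])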
@@ -329,7 +377,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
 
     # Headers for SearXNG request
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'User-Agent': get_random_user_agent(include_searx=True),
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'en-US,en;q=0.5',
        'Origin': 'https://shreyas094-searxng-local.hf.space',
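The include_searx flag encodes a deliberate split: requests to the author's own SearXNG instance always present the one pinned Chrome User-Agent, while third-party page fetches rotate per call. Illustratively:

searx_headers = {'User-Agent': get_random_user_agent(include_searx=True)}  # always the pinned agent
page_headers = {'User-Agent': get_random_user_agent()}  # random pick per call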