Shreyas094
committed on
Commit
•
3817f14
1
Parent(s):
c17888a
Update app.py
Browse files
app.py
CHANGED
@@ -16,6 +16,7 @@ from datetime import datetime
|
|
16 |
import os
|
17 |
from dotenv import load_dotenv
|
18 |
import certifi
|
|
|
19 |
|
20 |
# Load environment variables from a .env file
|
21 |
load_dotenv()
|
@@ -66,12 +67,34 @@ def is_valid_url(url):
|
|
66 |
except ValueError:
|
67 |
return False
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
def scrape_with_bs4(url, session):
|
70 |
try:
|
71 |
-
|
|
|
72 |
response.raise_for_status()
|
73 |
-
soup = BeautifulSoup(response.content, 'html.parser')
|
74 |
|
|
|
75 |
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
|
76 |
|
77 |
if main_content:
|
@@ -79,19 +102,39 @@ def scrape_with_bs4(url, session):
|
|
79 |
else:
|
80 |
content = soup.get_text(strip=True)
|
81 |
|
82 |
-
return content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
except Exception as e:
|
84 |
-
logger.error(f"
|
85 |
-
return
|
86 |
|
|
|
87 |
def scrape_with_trafilatura(url):
|
88 |
try:
|
89 |
-
downloaded = fetch_url(url)
|
|
|
|
|
90 |
content = extract(downloaded)
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
92 |
except Exception as e:
|
93 |
-
logger.error(f"
|
94 |
-
return
|
95 |
|
96 |
def rephrase_query(chat_history, query, temperature=0.2):
|
97 |
system_prompt = """You are a highly intelligent conversational chatbot. Your task is to analyze the given context and new query, then decide whether to rephrase the query with or without incorporating the context. Follow these steps:
|
@@ -252,6 +295,11 @@ def scrape_full_content(url, scraper="trafilatura", max_chars=3000):
|
|
252 |
logger.error(f"Error scraping full content from {url}: {e}")
|
253 |
return ""
|
254 |
|
|
|
|
|
|
|
|
|
|
|
255 |
def llm_summarize(query, documents, llm_client, temperature=0.2):
|
256 |
system_prompt = """You are Sentinel, a world class Financial analysis AI model who is expert at searching the web and answering user's queries. You are also an expert at summarizing web pages or documents and searching for content in them."""
|
257 |
|
@@ -329,7 +377,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
329 |
|
330 |
# Headers for SearXNG request
|
331 |
headers = {
|
332 |
-
'User-Agent':
|
333 |
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
334 |
'Accept-Language': 'en-US,en;q=0.5',
|
335 |
'Origin': 'https://shreyas094-searxng-local.hf.space',
|
|
|
16 |
import os
|
17 |
from dotenv import load_dotenv
|
18 |
import certifi
|
19 |
+
import random
|
20 |
|
21 |
# Load environment variables from a .env file
|
22 |
load_dotenv()
|
|
|
67 |
except ValueError:
|
68 |
return False
|
69 |
|
70 |
+
class ScrapingError(Exception):
    """Signal that a page could not be scraped.

    Attributes:
        message: Description of the failure; also used as the exception text.
        status_code: Status code supplied by the raiser, or ``None`` when
            none was given.
    """

    def __init__(self, message, status_code=None):
        # Initialise the base exception first, then record both fields.
        super().__init__(message)
        self.message, self.status_code = message, status_code
|
75 |
+
|
76 |
+
def get_random_user_agent(include_searx=False):
    """Return a browser User-Agent string for outgoing requests.

    Args:
        include_searx: When truthy, return the single fixed User-Agent
            instead of picking one at random.

    Returns:
        The fixed Chrome-on-Windows string when ``include_searx`` is truthy,
        otherwise a random entry from the built-in pool.
    """
    if include_searx:
        # Fixed agent: no randomness on this path.
        return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

    agent_pool = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
        # Add more user agents...
    )
    return random.choice(agent_pool)
|
89 |
+
|
90 |
+
@retry(stop=stop_after_attempt(1), wait=wait_exponential(multiplier=1, min=4, max=10))
|
91 |
def scrape_with_bs4(url, session):
|
92 |
try:
|
93 |
+
headers = {'User-Agent': get_random_user_agent()}
|
94 |
+
response = session.get(url, timeout=15, headers=headers)
|
95 |
response.raise_for_status()
|
|
|
96 |
|
97 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
98 |
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
|
99 |
|
100 |
if main_content:
|
|
|
102 |
else:
|
103 |
content = soup.get_text(strip=True)
|
104 |
|
105 |
+
return {'success': True, 'content': content}
|
106 |
+
except requests.exceptions.HTTPError as e:
|
107 |
+
if e.response.status_code == 403:
|
108 |
+
logger.warning(f"403 Forbidden error for {url}. Retrying with backoff.")
|
109 |
+
raise ScrapingError("403 Forbidden", status_code=403)
|
110 |
+
logger.error(f"HTTP error scraping {url}: {e}")
|
111 |
+
return {'success': False, 'error': str(e), 'status_code': e.response.status_code}
|
112 |
+
except requests.exceptions.Timeout:
|
113 |
+
logger.error(f"Timeout error scraping {url}")
|
114 |
+
return {'success': False, 'error': 'Timeout'}
|
115 |
+
except requests.exceptions.ConnectionError:
|
116 |
+
logger.error(f"Connection error scraping {url}")
|
117 |
+
return {'success': False, 'error': 'Connection Error'}
|
118 |
except Exception as e:
|
119 |
+
logger.error(f"Unexpected error scraping {url}: {e}")
|
120 |
+
return {'success': False, 'error': str(e)}
|
121 |
|
122 |
+
@retry(stop=stop_after_attempt(1), wait=wait_exponential(multiplier=1, min=4, max=10))
def scrape_with_trafilatura(url):
    """Download ``url`` with Trafilatura and extract its main text.

    The 10-second download timeout is applied through a Trafilatura
    configuration object. ``fetch_url`` has no ``timeout`` keyword, so
    passing one raises ``TypeError`` on every call and the scrape could
    never succeed.

    Every failure is reported in the return value rather than raised, so
    the ``retry`` decorator (limited to a single attempt) never re-invokes
    this function.

    Args:
        url: Address of the page to scrape.

    Returns:
        ``{'success': True, 'content': <text>}`` when extraction succeeds,
        otherwise ``{'success': False, 'error': <message>}``.
    """
    try:
        # Local imports keep this fix self-contained; they sit inside the
        # ``try`` so an import problem is reported like any other failure.
        from copy import deepcopy
        from trafilatura.settings import DEFAULT_CONFIG

        # Work on a copy so the library-wide defaults stay untouched.
        config = deepcopy(DEFAULT_CONFIG)
        config['DEFAULT']['DOWNLOAD_TIMEOUT'] = '10'

        downloaded = fetch_url(url, config=config)
        if downloaded is None:
            raise ScrapingError("Failed to download content")
        content = extract(downloaded)
        if content is None:
            raise ScrapingError("Failed to extract content")
        return {'success': True, 'content': content}
    except ScrapingError as e:
        logger.error(f"Scraping error for {url}: {e}")
        return {'success': False, 'error': str(e)}
    except Exception as e:
        logger.error(f"Unexpected error scraping {url} with Trafilatura: {e}")
        return {'success': False, 'error': str(e)}
|
138 |
|
139 |
def rephrase_query(chat_history, query, temperature=0.2):
|
140 |
system_prompt = """You are a highly intelligent conversational chatbot. Your task is to analyze the given context and new query, then decide whether to rephrase the query with or without incorporating the context. Follow these steps:
|
|
|
295 |
logger.error(f"Error scraping full content from {url}: {e}")
|
296 |
return ""
|
297 |
|
298 |
+
|
299 |
+
def rate_limited_scraping(url, scraper_func, *args, **kwargs):
    """Wait a random 1-3 seconds, then run ``scraper_func`` on ``url``.

    Args:
        url: Address passed to ``scraper_func`` as its first argument.
        scraper_func: Callable that performs the scrape.
        *args: Extra positional arguments forwarded to ``scraper_func``.
        **kwargs: Extra keyword arguments forwarded to ``scraper_func``.

    Returns:
        Whatever ``scraper_func`` returns.
    """
    pause_seconds = random.uniform(1, 3)
    time.sleep(pause_seconds)
    outcome = scraper_func(url, *args, **kwargs)
    return outcome
|
302 |
+
|
303 |
def llm_summarize(query, documents, llm_client, temperature=0.2):
|
304 |
system_prompt = """You are Sentinel, a world class Financial analysis AI model who is expert at searching the web and answering user's queries. You are also an expert at summarizing web pages or documents and searching for content in them."""
|
305 |
|
|
|
377 |
|
378 |
# Headers for SearXNG request
|
379 |
headers = {
|
380 |
+
'User-Agent': get_random_user_agent(include_searx=True),
|
381 |
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
382 |
'Accept-Language': 'en-US,en;q=0.5',
|
383 |
'Origin': 'https://shreyas094-searxng-local.hf.space',
|