Shreyas094
committed on
Commit
•
84a4885
1
Parent(s):
ef24902
Update app.py
Browse files
app.py
CHANGED
@@ -16,8 +16,6 @@ from datetime import datetime
|
|
16 |
import os
|
17 |
from dotenv import load_dotenv
|
18 |
import certifi
|
19 |
-
import random
|
20 |
-
from tenacity import retry, stop_after_attempt, wait_exponential
|
21 |
|
22 |
# Load environment variables from a .env file
|
23 |
load_dotenv()
|
@@ -68,34 +66,12 @@ def is_valid_url(url):
|
|
68 |
except ValueError:
|
69 |
return False
|
70 |
|
71 |
-
class ScrapingError(Exception):
    """Raised when a page cannot be scraped; optionally carries the HTTP status.

    Attributes:
        message: human-readable failure description.
        status_code: HTTP status that triggered the failure, or None.
    """

    def __init__(self, message, status_code=None):
        super().__init__(message)
        self.message = message
        self.status_code = status_code
|
76 |
-
|
77 |
-
def get_random_user_agent(include_searx=False):
    """Return a User-Agent header value.

    Args:
        include_searx: when True, return the fixed Chrome UA used for
            SearXNG requests instead of a random one.

    Returns:
        A browser User-Agent string — fixed for SearXNG, otherwise drawn
        at random from the pool below.
    """
    searx_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    if include_searx:
        return searx_agent

    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
        # Add more user agents...
    ]
    return random.choice(user_agents)
|
90 |
-
|
91 |
-
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
92 |
def scrape_with_bs4(url, session):
|
93 |
try:
|
94 |
-
|
95 |
-
response = session.get(url, timeout=15, headers=headers)
|
96 |
response.raise_for_status()
|
97 |
-
|
98 |
soup = BeautifulSoup(response.content, 'html.parser')
|
|
|
99 |
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
|
100 |
|
101 |
if main_content:
|
@@ -103,39 +79,19 @@ def scrape_with_bs4(url, session):
|
|
103 |
else:
|
104 |
content = soup.get_text(strip=True)
|
105 |
|
106 |
-
return
|
107 |
-
except requests.exceptions.HTTPError as e:
|
108 |
-
if e.response.status_code == 403:
|
109 |
-
logger.warning(f"403 Forbidden error for {url}. Retrying with backoff.")
|
110 |
-
raise ScrapingError("403 Forbidden", status_code=403)
|
111 |
-
logger.error(f"HTTP error scraping {url}: {e}")
|
112 |
-
return {'success': False, 'error': str(e), 'status_code': e.response.status_code}
|
113 |
-
except requests.exceptions.Timeout:
|
114 |
-
logger.error(f"Timeout error scraping {url}")
|
115 |
-
return {'success': False, 'error': 'Timeout'}
|
116 |
-
except requests.exceptions.ConnectionError:
|
117 |
-
logger.error(f"Connection error scraping {url}")
|
118 |
-
return {'success': False, 'error': 'Connection Error'}
|
119 |
except Exception as e:
|
120 |
-
logger.error(f"
|
121 |
-
return
|
122 |
|
123 |
-
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
124 |
def scrape_with_trafilatura(url):
|
125 |
try:
|
126 |
-
downloaded = fetch_url(url)
|
127 |
-
if downloaded is None:
|
128 |
-
raise ScrapingError("Failed to download content")
|
129 |
content = extract(downloaded)
|
130 |
-
|
131 |
-
raise ScrapingError("Failed to extract content")
|
132 |
-
return {'success': True, 'content': content}
|
133 |
-
except ScrapingError as e:
|
134 |
-
logger.error(f"Scraping error for {url}: {e}")
|
135 |
-
return {'success': False, 'error': str(e)}
|
136 |
except Exception as e:
|
137 |
-
logger.error(f"
|
138 |
-
return
|
139 |
|
140 |
def rephrase_query(chat_history, query, temperature=0.2):
|
141 |
system_prompt = """You are a highly intelligent conversational chatbot. Your task is to analyze the given context and new query, then decide whether to rephrase the query with or without incorporating the context. Follow these steps:
|
@@ -296,11 +252,6 @@ def scrape_full_content(url, scraper="trafilatura", max_chars=3000):
|
|
296 |
logger.error(f"Error scraping full content from {url}: {e}")
|
297 |
return ""
|
298 |
|
299 |
-
|
300 |
-
def rate_limited_scraping(url, scraper_func, *args, **kwargs):
    """Throttle wrapper: pause briefly, then delegate to *scraper_func*.

    Sleeps a jittered 1-3 seconds so successive requests do not hit the
    same host in rapid succession, then forwards url plus any extra
    positional/keyword arguments and returns the scraper's result.
    """
    delay = random.uniform(1, 3)  # Random delay between 1-3 seconds
    time.sleep(delay)
    return scraper_func(url, *args, **kwargs)
|
303 |
-
|
304 |
def llm_summarize(query, documents, llm_client, temperature=0.2):
|
305 |
system_prompt = """You are Sentinel, a world class Financial analysis AI model who is expert at searching the web and answering user's queries. You are also an expert at summarizing web pages or documents and searching for content in them."""
|
306 |
|
@@ -378,7 +329,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
378 |
|
379 |
# Headers for SearXNG request
|
380 |
headers = {
|
381 |
-
'User-Agent':
|
382 |
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
383 |
'Accept-Language': 'en-US,en;q=0.5',
|
384 |
'Origin': 'https://shreyas094-searxng-local.hf.space',
|
@@ -415,7 +366,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
415 |
logger.warning("No results returned from SearXNG.")
|
416 |
return "No results found for the given query."
|
417 |
|
418 |
-
|
419 |
|
420 |
for result in search_results.get('results', [])[:num_results]:
|
421 |
url = result.get('url', '')
|
@@ -428,24 +379,41 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
428 |
try:
|
429 |
logger.info(f"Scraping content from: {url}")
|
430 |
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
|
|
|
|
435 |
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
445 |
scraped_content.append({
|
446 |
"title": title,
|
447 |
"url": url,
|
448 |
-
"content": content,
|
449 |
"scraper": scraper
|
450 |
})
|
451 |
except requests.exceptions.RequestException as e:
|
@@ -561,4 +529,4 @@ iface = gr.ChatInterface(
|
|
561 |
|
562 |
if __name__ == "__main__":
|
563 |
logger.info("Starting the SearXNG Scraper for Financial News using ChatInterface with Advanced Parameters")
|
564 |
-
iface.launch(share=True)
|
|
|
16 |
import os
|
17 |
from dotenv import load_dotenv
|
18 |
import certifi
|
|
|
|
|
19 |
|
20 |
# Load environment variables from a .env file
|
21 |
load_dotenv()
|
|
|
66 |
except ValueError:
|
67 |
return False
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
def scrape_with_bs4(url, session):
|
70 |
try:
|
71 |
+
response = session.get(url, timeout=10)
|
|
|
72 |
response.raise_for_status()
|
|
|
73 |
soup = BeautifulSoup(response.content, 'html.parser')
|
74 |
+
|
75 |
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
|
76 |
|
77 |
if main_content:
|
|
|
79 |
else:
|
80 |
content = soup.get_text(strip=True)
|
81 |
|
82 |
+
return content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
except Exception as e:
|
84 |
+
logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
|
85 |
+
return ""
|
86 |
|
|
|
87 |
def scrape_with_trafilatura(url):
    """Fetch *url* and return its extracted main text via trafilatura.

    Returns "" on any failure (download error, exception, or empty
    extraction); errors are logged rather than propagated so callers can
    fall through to another scraper.
    """
    try:
        page = fetch_url(url)
        text = extract(page)
        return text or ""
    except Exception as e:
        logger.error(f"Error scraping {url} with Trafilatura: {e}")
        return ""
|
95 |
|
96 |
def rephrase_query(chat_history, query, temperature=0.2):
|
97 |
system_prompt = """You are a highly intelligent conversational chatbot. Your task is to analyze the given context and new query, then decide whether to rephrase the query with or without incorporating the context. Follow these steps:
|
|
|
252 |
logger.error(f"Error scraping full content from {url}: {e}")
|
253 |
return ""
|
254 |
|
|
|
|
|
|
|
|
|
|
|
255 |
def llm_summarize(query, documents, llm_client, temperature=0.2):
|
256 |
system_prompt = """You are Sentinel, a world class Financial analysis AI model who is expert at searching the web and answering user's queries. You are also an expert at summarizing web pages or documents and searching for content in them."""
|
257 |
|
|
|
329 |
|
330 |
# Headers for SearXNG request
|
331 |
headers = {
|
332 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
333 |
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
334 |
'Accept-Language': 'en-US,en;q=0.5',
|
335 |
'Origin': 'https://shreyas094-searxng-local.hf.space',
|
|
|
366 |
logger.warning("No results returned from SearXNG.")
|
367 |
return "No results found for the given query."
|
368 |
|
369 |
+
scraped_content = []
|
370 |
|
371 |
for result in search_results.get('results', [])[:num_results]:
|
372 |
url = result.get('url', '')
|
|
|
379 |
try:
|
380 |
logger.info(f"Scraping content from: {url}")
|
381 |
|
382 |
+
# Implement a retry mechanism with different user agents
|
383 |
+
user_agents = [
|
384 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
385 |
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
|
386 |
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
387 |
+
]
|
388 |
|
389 |
+
content = ""
|
390 |
+
for ua in user_agents:
|
391 |
+
try:
|
392 |
+
if scraper == "bs4":
|
393 |
+
session.headers.update({'User-Agent': ua})
|
394 |
+
content = scrape_with_bs4(url, session)
|
395 |
+
else: # trafilatura
|
396 |
+
downloaded = fetch_url(url, headers={'User-Agent': ua})
|
397 |
+
content = extract(downloaded)
|
398 |
+
|
399 |
+
if content:
|
400 |
+
break
|
401 |
+
except requests.exceptions.HTTPError as e:
|
402 |
+
if e.response.status_code == 403:
|
403 |
+
logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
|
404 |
+
continue
|
405 |
+
else:
|
406 |
+
raise
|
407 |
+
|
408 |
+
if not content:
|
409 |
+
logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
|
410 |
+
continue
|
411 |
+
|
412 |
+
# Limit content to max_chars
|
413 |
scraped_content.append({
|
414 |
"title": title,
|
415 |
"url": url,
|
416 |
+
"content": content[:max_chars],
|
417 |
"scraper": scraper
|
418 |
})
|
419 |
except requests.exceptions.RequestException as e:
|
|
|
529 |
|
530 |
if __name__ == "__main__":
|
531 |
logger.info("Starting the SearXNG Scraper for Financial News using ChatInterface with Advanced Parameters")
|
532 |
+
iface.launch(share=True)
|