Shreyas094 committed
Commit d07bea9 • 1 Parent(s): 07efc76
Update app.py
app.py CHANGED
@@ -7,6 +7,7 @@ from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 from trafilatura import fetch_url, extract
 from trafilatura import extract
+from requests.exceptions import Timeout
 from trafilatura.settings import use_config
 from urllib.request import urlopen, Request
 import json
@@ -87,11 +88,16 @@ def scrape_with_bs4(url, session, max_chars=None):
         logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
         return ""
 
-def scrape_with_trafilatura(url, max_chars=None):
+def scrape_with_trafilatura(url, max_chars=None, timeout=10):
     try:
-        downloaded = fetch_url(url)
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+        downloaded = response.text
         content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
         return (content or "")[:max_chars] if max_chars else (content or "")
+    except Timeout:
+        logger.error(f"Timeout error while scraping {url} with Trafilatura")
+        return ""
     except Exception as e:
         logger.error(f"Error scraping {url} with Trafilatura: {e}")
         return ""
@@ -228,13 +234,13 @@ Remember to focus on financial aspects and implications in your assessment and s
         logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
         return "Error: Unable to assess relevance and summarize"
 
-def scrape_full_content(url, scraper="trafilatura", max_chars=3000):
+def scrape_full_content(url, scraper="trafilatura", max_chars=3000, timeout=10):
     try:
         logger.info(f"Scraping full content from: {url}")
 
         if scraper == "bs4":
             session = requests_retry_session()
-            response = session.get(url, timeout=
+            response = session.get(url, timeout=timeout)
             response.raise_for_status()
             soup = BeautifulSoup(response.content, 'html.parser')
 
@@ -246,11 +252,13 @@ def scrape_full_content(url, scraper="trafilatura", max_chars=3000):
             else:
                 content = soup.get_text(strip=True, separator='\n')
         else:  # trafilatura
-            downloaded = fetch_url(url)
-            content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
+            content = scrape_with_trafilatura(url, max_chars, timeout)
 
         # Limit the content to max_chars
         return content[:max_chars] if content else ""
+    except Timeout:
+        logger.error(f"Timeout error while scraping full content from {url}")
+        return ""
     except Exception as e:
         logger.error(f"Error scraping full content from {url}: {e}")
         return ""
@@ -298,7 +306,7 @@ from trafilatura.settings import use_config
 from urllib.request import urlopen, Request
 
 def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura", max_chars=3000, time_range="", language="all", category="",
-                      engines=[], safesearch=2, method="GET", llm_temperature=0.2):
+                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=10):
     try:
         # Step 1: Rephrase the Query
         rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
@@ -407,7 +415,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
             config = use_config()
             config.set("DEFAULT", "USER_AGENT", ua)
 
-            content = scrape_with_trafilatura(url, max_chars)
+            content = scrape_with_trafilatura(url, max_chars, timeout=timeout)
 
             if content:
                 break
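For reference, a minimal standalone sketch of the timeout pattern this commit applies: an explicit timeout on requests.get plus a dedicated Timeout handler ahead of the generic except, so a stalled host returns an empty string instead of hanging. The helper name fetch_and_extract, the example URL, and the 3-second value are illustrative only, not part of the commit.

import logging

import requests
from requests.exceptions import Timeout
from trafilatura import extract

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def fetch_and_extract(url, max_chars=None, timeout=3):
    # Hypothetical helper mirroring the commit's pattern, not code from app.py.
    try:
        response = requests.get(url, timeout=timeout)  # raises Timeout if the host stalls
        response.raise_for_status()                     # raises HTTPError on 4xx/5xx
        content = extract(response.text, include_comments=False,
                          include_tables=True, no_fallback=False)
        return (content or "")[:max_chars] if max_chars else (content or "")
    except Timeout:
        logger.error(f"Timeout error while scraping {url}")
        return ""
    except Exception as e:
        logger.error(f"Error scraping {url}: {e}")
        return ""

if __name__ == "__main__":
    # A slow or unreachable host now fails fast instead of blocking the caller.
    print(fetch_and_extract("https://example.com", max_chars=500))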