Shreyas094 committed
Commit c6a0be6
Parent(s): 6552a74
Update app.py

app.py CHANGED
@@ -142,35 +142,50 @@ def scrape_with_bs4(url, session, max_chars=None):
         logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
         return ""
 
-def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
+def scrape_with_trafilatura(url, max_chars=None, timeout=5):
+    """
+    Scrape web content using Trafilatura with simplified error handling and fallback options.
+
+    Args:
+        url (str): The URL to scrape
+        max_chars (int, optional): Maximum number of characters to return
+        timeout (int, optional): Request timeout in seconds
+
+    Returns:
+        str: Extracted content or empty string if extraction fails
+    """
     try:
+        # Make the request with timeout
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
-        downloaded = response.text
-        content = ""
-
-        if use_beautifulsoup:
-            soup = BeautifulSoup(downloaded, "lxml")
-            # Convert BeautifulSoup object to a string
-            html_string = str(soup)
-            # Use Trafilatura's extract function directly on the HTML string
-            content = extract(html_string, include_comments=False, include_tables=True, no_fallback=False)
 
-        #
-
-
-
+        # Extract content from the downloaded HTML
+        content = extract(
+            response.text,
+            include_comments=False,
+            include_tables=True,
+            no_fallback=False
+        )
 
-        # If extraction failed or BeautifulSoup was not used, extract from the downloaded text
+        # If first attempt fails, try direct URL extraction
         if not content:
-            content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
+            content = extract(
+                url,
+                include_comments=False,
+                include_tables=True,
+                no_fallback=False
+            )
+
+        # Return content with optional length limit
+        if content and max_chars:
+            return content[:max_chars]
+        return content or ""
 
-        return (content or "")[:max_chars] if max_chars else (content or "")
     except requests.Timeout:
-        logger.error(f"Timeout error while scraping {url} with Trafilatura")
+        logger.error(f"Timeout error while scraping {url}")
         return ""
     except Exception as e:
-        logger.error(f"Error scraping {url} with Trafilatura: {e}")
+        logger.error(f"Error scraping {url}: {str(e)}")
         return ""
 
 def rephrase_query(chat_history, query, temperature=0.2):
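After this change, scrape_with_trafilatura() always fetches the page with requests and passes the raw HTML to Trafilatura's extract(). A minimal usage sketch; the URL and the print are illustrative, not part of the commit:

    # Assumes the imports already present in app.py (requests, trafilatura's extract, logger)
    text = scrape_with_trafilatura("https://example.com", max_chars=500, timeout=5)
    print(text or "extraction failed or timed out")

One caveat on the fallback: trafilatura's extract() expects an HTML string (or parsed tree) rather than a URL, so the second extract(url, ...) attempt will most likely return None; fetching by URL is normally done with trafilatura.fetch_url(url) followed by extract().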
@@ -307,9 +322,23 @@ Remember to focus on financial aspects and implications in your assessment and summary.
         return "Error: Unable to assess relevance and summarize"
 
 def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
+    """
+    Unified content scraper that supports multiple scraping methods.
+
+    Args:
+        url (str): The URL to scrape
+        scraper (str): Scraping method to use ('bs4', 'trafilatura', 'scrapy', 'newspaper')
+        max_chars (int): Maximum number of characters to return
+        timeout (int): Request timeout in seconds
+
+    Returns:
+        str: Scraped content or empty string if scraping fails
+    """
     try:
         logger.info(f"Scraping full content from: {url}")
 
+        content = ""
+
         if scraper == "bs4":
             session = requests_retry_session()
             response = session.get(url, timeout=timeout)
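The bs4 branch calls a requests_retry_session() helper that is defined elsewhere in app.py and does not appear in this diff. For context, here is the common shape of such a helper built on urllib3's Retry; this is an assumed sketch, not the commit's actual definition:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    def requests_retry_session(retries=3, backoff_factor=0.3, status_forcelist=(500, 502, 504), session=None):
        # Assumed sketch: a Session that retries transient failures with exponential backoff
        session = session or requests.Session()
        retry = Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        return session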
@@ -317,26 +346,39 @@ def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
             soup = BeautifulSoup(response.content, 'html.parser')
 
             # Try to find the main content
-            main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
+            main_content = (
+                soup.find('main') or
+                soup.find('article') or
+                soup.find('div', class_='content')
+            )
+
+            content = main_content.get_text(strip=True, separator='\n') if main_content else soup.get_text(strip=True, separator='\n')
 
-            if main_content:
-                content = main_content.get_text(strip=True, separator='\n')
-            else:
-                content = soup.get_text(strip=True, separator='\n')
         elif scraper == "trafilatura":
-            content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
+            content = scrape_with_trafilatura(url, max_chars, timeout)
+
         elif scraper == "scrapy":
             content = scrape_with_scrapy(url, timeout)
+
         elif scraper == "newspaper":
-
+            article = Article(url)
+            article.download()
+            article.parse()
+            content = article.text
+
         else:
             logger.error(f"Unknown scraper: {scraper}")
             return ""
 
-        # Limit the content to max_chars
-
-
-
+        # Standardize whitespace and limit content length
+        if content:
+            content = " ".join(content.split())  # Standardize whitespace
+            return content[:max_chars] if max_chars else content
+
+        return ""
+
+    except requests.Timeout:
+        logger.error(f"Timeout error while scraping {url}")
         return ""
     except Exception as e:
         logger.error(f"Error scraping full content from {url}: {e}")
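Note that the new closing block runs " ".join(content.split()) on every backend's output, which collapses all whitespace, including the newlines the bs4 branch just inserted via separator='\n'; callers therefore always receive single-line text. A quick way to exercise the unified entry point; the URL is illustrative, and the 'scrapy' and 'newspaper' branches require scrape_with_scrapy and newspaper's Article to be available:

    for scraper in ("bs4", "trafilatura", "newspaper"):
        content = scrape_full_content("https://example.com", scraper=scraper, max_chars=3000)
        print(f"{scraper}: {len(content)} chars")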