Shreyas094
committed on
Commit
•
1a81bf1
1
Parent(s):
6400d84
Update app.py
Browse files
app.py
CHANGED
@@ -69,7 +69,7 @@ def is_valid_url(url):
|
|
69 |
except ValueError:
|
70 |
return False
|
71 |
|
72 |
-
def scrape_with_bs4(url, session):
|
73 |
try:
|
74 |
response = session.get(url, timeout=5)
|
75 |
response.raise_for_status()
|
@@ -78,20 +78,20 @@ def scrape_with_bs4(url, session):
|
|
78 |
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
|
79 |
|
80 |
if main_content:
|
81 |
-
content = main_content.get_text(strip=True)
|
82 |
else:
|
83 |
-
content = soup.get_text(strip=True)
|
84 |
|
85 |
-
return content
|
86 |
except Exception as e:
|
87 |
logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
|
88 |
return ""
|
89 |
|
90 |
-
def scrape_with_trafilatura(url):
|
91 |
try:
|
92 |
downloaded = fetch_url(url)
|
93 |
-
content = extract(downloaded)
|
94 |
-
return content or ""
|
95 |
except Exception as e:
|
96 |
logger.error(f"Error scraping {url} with Trafilatura: {e}")
|
97 |
return ""
|
@@ -371,72 +371,71 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
371 |
break
|
372 |
|
373 |
for result in results:
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
# Configure trafilatura to use a specific user agent
|
407 |
-
config = use_config()
|
408 |
-
config.set("DEFAULT", "USER_AGENT", ua)
|
409 |
-
|
410 |
-
content = extract(downloaded, config=config)
|
411 |
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
|
|
422 |
continue
|
423 |
-
|
424 |
-
|
425 |
-
|
|
|
426 |
continue
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
|
|
|
|
|
|
440 |
|
441 |
page += 1
|
442 |
|
|
|
69 |
except ValueError:
|
70 |
return False
|
71 |
|
72 |
+
def scrape_with_bs4(url, session, max_chars=None):
|
73 |
try:
|
74 |
response = session.get(url, timeout=5)
|
75 |
response.raise_for_status()
|
|
|
78 |
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
|
79 |
|
80 |
if main_content:
|
81 |
+
content = main_content.get_text(strip=True, separator='\n')
|
82 |
else:
|
83 |
+
content = soup.get_text(strip=True, separator='\n')
|
84 |
|
85 |
+
return content[:max_chars] if max_chars else content
|
86 |
except Exception as e:
|
87 |
logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
|
88 |
return ""
|
89 |
|
90 |
+
def scrape_with_trafilatura(url, max_chars=None):
|
91 |
try:
|
92 |
downloaded = fetch_url(url)
|
93 |
+
content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
|
94 |
+
return (content or "")[:max_chars] if max_chars else (content or "")
|
95 |
except Exception as e:
|
96 |
logger.error(f"Error scraping {url} with Trafilatura: {e}")
|
97 |
return ""
|
|
|
371 |
break
|
372 |
|
373 |
for result in results:
|
374 |
+
if len(scraped_content) >= num_results:
|
375 |
+
break
|
376 |
+
|
377 |
+
url = result.get('url', '')
|
378 |
+
title = result.get('title', 'No title')
|
379 |
+
|
380 |
+
if not is_valid_url(url):
|
381 |
+
logger.warning(f"Invalid URL: {url}")
|
382 |
+
continue
|
383 |
+
|
384 |
+
try:
|
385 |
+
logger.info(f"Scraping content from: {url}")
|
386 |
+
|
387 |
+
# Implement a retry mechanism with different user agents
|
388 |
+
user_agents = [
|
389 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
390 |
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
|
391 |
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
392 |
+
]
|
393 |
+
|
394 |
+
content = ""
|
395 |
+
for ua in user_agents:
|
396 |
+
try:
|
397 |
+
if scraper == "bs4":
|
398 |
+
session.headers.update({'User-Agent': ua})
|
399 |
+
content = scrape_with_bs4(url, session, max_chars)
|
400 |
+
else: # trafilatura
|
401 |
+
# Use urllib to handle custom headers for trafilatura
|
402 |
+
req = Request(url, headers={'User-Agent': ua})
|
403 |
+
with urlopen(req) as response:
|
404 |
+
downloaded = response.read()
|
|
|
|
|
|
|
|
|
|
|
|
|
405 |
|
406 |
+
# Configure trafilatura to use a specific user agent
|
407 |
+
config = use_config()
|
408 |
+
config.set("DEFAULT", "USER_AGENT", ua)
|
409 |
+
|
410 |
+
content = scrape_with_trafilatura(url, max_chars)
|
411 |
+
|
412 |
+
if content:
|
413 |
+
break
|
414 |
+
except requests.exceptions.HTTPError as e:
|
415 |
+
if e.response.status_code == 403:
|
416 |
+
logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
|
417 |
continue
|
418 |
+
else:
|
419 |
+
raise
|
420 |
+
except Exception as e:
|
421 |
+
logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
|
422 |
continue
|
423 |
+
|
424 |
+
if not content:
|
425 |
+
logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
|
426 |
+
continue
|
427 |
+
|
428 |
+
scraped_content.append({
|
429 |
+
"title": title,
|
430 |
+
"url": url,
|
431 |
+
"content": content, # No need to slice here as it's already limited
|
432 |
+
"scraper": scraper
|
433 |
+
})
|
434 |
+
logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
|
435 |
+
except requests.exceptions.RequestException as e:
|
436 |
+
logger.error(f"Error scraping {url}: {e}")
|
437 |
+
except Exception as e:
|
438 |
+
logger.error(f"Unexpected error while scraping {url}: {e}")
|
439 |
|
440 |
page += 1
|
441 |
|