import logging
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, urljoin
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class WebScraper:
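    """Scrapes web pages while respecting robots.txt and a per-domain rate limit."""
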
    def __init__(self, user_agent="WebLLMAssistant/1.0 (+https://github.com/YourUsername/Web-LLM-Assistant-Llama-cpp)",
                 rate_limit=1, timeout=10, max_retries=3):
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        self.rate_limit = rate_limit
        self.timeout = timeout
        self.max_retries = max_retries
        self.last_request_time = {}

    def can_fetch(self, url):
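        """Check robots.txt for this URL; on any error, default to allowing the fetch."""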
        parsed_url = urlparse(url)
        robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        # A fresh parser per call keeps concurrent lookups for different domains independent.
        robot_parser = RobotFileParser()
        robot_parser.set_url(robots_url)
        try:
            robot_parser.read()
            return robot_parser.can_fetch(self.session.headers["User-Agent"], url)
        except Exception as e:
            logger.warning(f"Error reading robots.txt for {url}: {e}")
            return True

    def respect_rate_limit(self, url):
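        """Sleep as needed to keep at least `rate_limit` seconds between requests to the same domain."""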
        domain = urlparse(url).netloc
        current_time = time.time()
        if domain in self.last_request_time:
            time_since_last_request = current_time - self.last_request_time[domain]
            if time_since_last_request < self.rate_limit:
                time.sleep(self.rate_limit - time_since_last_request)
        self.last_request_time[domain] = time.time()

    def scrape_page(self, url):
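        """Fetch a page with retries and exponential backoff; return the extracted page data or None."""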
        if not self.can_fetch(url):
            logger.info(f"Robots.txt disallows scraping: {url}")
            return None

        for attempt in range(self.max_retries):
            try:
                self.respect_rate_limit(url)
                response = self.session.get(url, timeout=self.timeout)
                response.raise_for_status()
                return self.extract_content(response.text, url)
            except requests.RequestException as e:
                logger.warning(f"Error scraping {url} (attempt {attempt + 1}/{self.max_retries}): {e}")
                if attempt == self.max_retries - 1:
                    logger.error(f"Failed to scrape {url} after {self.max_retries} attempts")
                    return None
                # Exponential backoff before the next attempt: 1s, 2s, 4s, ...
                time.sleep(2 ** attempt)

    def extract_content(self, html, url):
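        """Parse HTML into a dict with the page title, truncated main text, and outgoing links."""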
        soup = BeautifulSoup(html, 'html.parser')

        # Drop elements that rarely contain article text.
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        title = soup.title.string if soup.title and soup.title.string else ""

        # Prefer an explicit main/article/content container; fall back to the whole page.
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
        if main_content:
            paragraphs = main_content.find_all('p')
        else:
            paragraphs = soup.find_all('p')

        text = ' '.join([p.get_text().strip() for p in paragraphs])

        if not text:
            text = soup.get_text()

        # Collapse runs of whitespace into single spaces.
        text = re.sub(r'\s+', ' ', text).strip()

        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]

        return {
            "url": url,
            "title": title,
            "content": text[:2400],
            "links": links[:10]
        }


def scrape_multiple_pages(urls, max_workers=5):
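    """Scrape several URLs concurrently; return a dict mapping each URL to its extracted data."""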
    scraper = WebScraper()
    results = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(scraper.scrape_page, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                if data:
                    results[url] = data
                    logger.info(f"Successfully scraped: {url}")
                else:
                    logger.warning(f"Failed to scrape: {url}")
            except Exception as exc:
                logger.error(f"{url} generated an exception: {exc}")

    return results


def get_web_content(urls):
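    """Return {url: content} for every URL that was scraped successfully."""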
    scraped_data = scrape_multiple_pages(urls)
    return {url: data['content'] for url, data in scraped_data.items() if data}


def can_fetch(url):
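    """Module-level robots.txt check using the wildcard user agent."""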
    parsed_url = urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
        return rp.can_fetch("*", url)
    except Exception as e:
        logger.warning(f"Error reading robots.txt for {url}: {e}")
        return True


if __name__ == "__main__":
    test_urls = [
        "https://en.wikipedia.org/wiki/Web_scraping",
        "https://example.com",
        "https://www.python.org"
    ]
    scraped_content = get_web_content(test_urls)
    for url, content in scraped_content.items():
        print(f"Content from {url}:")
        print(content[:500])
        print("\n---\n")