"""Crawl every page of a single domain with spidy and save the extracted text."""

import logging
import os
import re
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from spidy import crawler

logger = logging.getLogger(__name__)


class DomainCrawler:
    """Crawls all pages under a single domain and saves their text content."""

    def __init__(self, start_url, output_dir="crawled_content"):
        self.start_url = start_url
        self.domain = urlparse(start_url).netloc
        self.output_dir = output_dir

        # Make sure the output directory exists before any pages are saved.
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            logger.info(f"Created output directory: {output_dir}")

        # Configure the spidy crawler: stay on the start domain, save pages,
        # and throttle requests with a short delay between fetches.
        self.crawler = crawler.Crawler(
            start_url=start_url,
            max_pages=1000,
            timeout=10,
            delay=0.5,
            save_pages=True,
            save_path=output_dir,
            restrict_domain=True,
            verbose=True,
        )

        # Route every fetched page through our custom processor.
        self.crawler.page_handler = self.process_page

    def process_page(self, url, content):
        """Extract the main text content from a crawled page and save it to a file."""
        try:
            soup = BeautifulSoup(content, "html.parser")

            # soup.title.string can be None (e.g. an empty <title>), so use
            # get_text() and fall back to a default title.
            title = soup.title.get_text(strip=True) if soup.title else ""
            if not title:
                title = "No Title"

            # Build a filesystem-safe filename from the page title.
            filename = re.sub(r"[^\w\-_]", "_", title) + ".txt"
            filepath = os.path.join(self.output_dir, filename)

            # Prefer semantic containers for the main content before falling
            # back to the whole <body>.
            main_content = (
                soup.find("main")
                or soup.find("article")
                or soup.find("div", class_="content")
            )

            if main_content:
                text_content = main_content.get_text(separator="\n", strip=True)
            else:
                text_content = (
                    soup.body.get_text(separator="\n", strip=True)
                    if soup.body
                    else "No content"
                )
                logger.warning(
                    f"No main content found for {url}, falling back to body text"
                )

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(f"URL: {url}\n")
                f.write(f"Title: {title}\n\n")
                f.write(text_content)

            logger.info(f"Saved content from {url} to {filepath}")

        except Exception as e:
            logger.error(f"Error processing {url}: {e}", exc_info=True)

        # Return the raw content so the crawler can continue handling the page.
        return content

    def start(self):
        """Start the crawling process and report a summary when it finishes."""
        logger.info(f"Starting crawl from {self.start_url}")
        self.crawler.crawl()

        logger.info("Crawl completed!")
        logger.info(f"Pages crawled: {len(self.crawler.links_crawled)}")
        logger.info(f"Content saved to: {os.path.abspath(self.output_dir)}")