"""
A web crawler module for extracting content from documentation websites.

This module provides classes for crawling a domain and extracting the main
content from web pages, saving the results as text files.
"""

import logging
import os
import re
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.utils.project import get_project_settings

logger = logging.getLogger(__name__)


class WebsiteSpider(CrawlSpider):
    """
    A Scrapy spider for crawling documentation websites and extracting content.

    This spider follows links within the allowed domain and extracts the main
    content from each page, saving it to a text file. It attempts to find
    content within <main>, <article>, or content <div> tags, falling back to
    the full body if none are found.

    Args:
        start_url (str): The URL where crawling should begin
        output_dir (str): Directory where extracted content should be saved
        *args: Additional positional arguments passed to CrawlSpider
        **kwargs: Additional keyword arguments passed to CrawlSpider
    """

    name = "website_spider"

    def __init__(self, start_url, output_dir, *args, **kwargs):
        self.start_urls = [start_url]
        self.allowed_domains = [urlparse(start_url).netloc]
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Created output directory: {output_dir}")

        # Rules must be defined before CrawlSpider.__init__ runs, because it
        # compiles them during initialization; rules assigned afterwards are
        # never compiled, and the spider would not follow any links.
        self.rules = (
            Rule(
                LinkExtractor(allow_domains=self.allowed_domains),
                callback="parse_item",
                follow=True,
            ),
        )
        super().__init__(*args, **kwargs)

    def parse_item(self, response):
        """
        Parse a webpage and extract its content.

        The extracted content is saved to a text file in the output directory,
        including the page URL, title, and main content.

        Args:
            response: The Scrapy response object containing the webpage
        """
        try:
            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(response.body, "html.parser")

            # Extract the title; <title> may be missing or have no string
            title = (
                soup.title.string.strip()
                if soup.title and soup.title.string
                else "No Title"
            )

            # Build a filesystem-safe filename from the title
            filename = re.sub(r"[^\w\-_]", "_", title) + ".txt"
            filepath = os.path.join(self.output_dir, filename)

            # Extract main content
            main_content = (
                soup.find("main")
                or soup.find("article")
                or soup.find("div", class_="content")
            )

            # If we found main content, extract the text
            if main_content:
                text_content = main_content.get_text(separator="\n", strip=True)
            else:
                # Fall back to body text
                text_content = (
                    soup.body.get_text(separator="\n", strip=True)
                    if soup.body
                    else "No content"
                )
                logger.warning(
                    f"No main content found for {response.url}, "
                    "falling back to body text"
                )

            # Save the extracted content
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(f"URL: {response.url}\n")
                f.write(f"Title: {title}\n\n")
                f.write(text_content)

            logger.info(f"Saved content from {response.url} to {filepath}")

        except Exception as e:
            logger.error(f"Error processing {response.url}: {e}", exc_info=True)


class DomainCrawler:
    """
    High-level crawler class for extracting content from a documentation website.

    This class provides a simple interface for crawling a website and
    extracting its content. It configures and runs a Scrapy crawler with
    sensible defaults for crawling documentation sites.

    Example:
        crawler = DomainCrawler("https://docs.example.com")
        crawler.start()  # Crawls the site and saves content to ./crawled_content/

    Args:
        start_url (str): The URL where crawling should begin
        output_dir (str, optional): Directory where extracted content should
            be saved. Defaults to "crawled_content"
    """

    def __init__(self, start_url, output_dir="crawled_content"):
        self.start_url = start_url
        self.domain = urlparse(start_url).netloc
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Created output directory: {output_dir}")

        # Configure Scrapy settings
        self.settings = get_project_settings()
        self.settings.update(
            {
                "BOT_NAME": "website_crawler",
                "ROBOTSTXT_OBEY": True,
                "CONCURRENT_REQUESTS": 16,
                "DOWNLOAD_DELAY": 1,
                "COOKIES_ENABLED": False,
                "USER_AGENT": "Mozilla/5.0 (compatible; SimplifyCrawler/1.0)",
            }
        )

    def start(self):
        """
        Start the crawling process.

        This method initiates the crawler and blocks until crawling is
        complete. The extracted content will be saved to the configured
        output directory.
""" logger.info(f"Starting crawl from {self.start_url}") process = CrawlerProcess(self.settings) process.crawl( WebsiteSpider, start_url=self.start_url, output_dir=self.output_dir ) process.start() logger.info("\nCrawl completed!") logger.info(f"Content saved to: {os.path.abspath(self.output_dir)}")