"""
A web crawler module for extracting content from documentation websites.

This module provides classes for crawling a domain and extracting the main content
from web pages, saving the results as text files.
"""
import logging
import os
import re
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.utils.project import get_project_settings

logger = logging.getLogger(__name__)


class WebsiteSpider(CrawlSpider):
"""
A Scrapy spider for crawling documentation websites and extracting content.
This spider follows links within the allowed domain and extracts the main content
from each page, saving it to a text file. It attempts to find content within <main>,
<article> or content div tags, falling back to the full body if none are found.
Args:
start_url (str): The URL where crawling should begin
output_dir (str): Directory where extracted content should be saved
*args: Additional positional arguments passed to CrawlSpider
**kwargs: Additional keyword arguments passed to CrawlSpider
"""
name = "website_spider"

    def __init__(self, start_url, output_dir, *args, **kwargs):
        self.start_urls = [start_url]
        self.allowed_domains = [urlparse(start_url).netloc]
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Created output directory: {output_dir}")
        # The rules must be defined before CrawlSpider.__init__ runs, because it
        # compiles them via _compile_rules(); setting them afterwards would leave
        # the spider with an empty rule set, so it would never call parse_item.
        self.rules = (
            Rule(
                LinkExtractor(allow_domains=self.allowed_domains),
                callback="parse_item",
                follow=True,
            ),
        )
        super().__init__(*args, **kwargs)

    def parse_item(self, response):
        """
        Parse a webpage and extract its content.

        The extracted content is saved to a text file in the output directory,
        including the page URL, title, and main content.

        Args:
            response: The Scrapy response object containing the webpage
        """
try:
# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(response.body, "html.parser")
# Extract the title
            # title.string is None when the <title> tag contains nested markup
            title = soup.title.string if soup.title and soup.title.string else "No Title"
# Clean the filename
filename = re.sub(r"[^\w\-_]", "_", title) + ".txt"
filepath = os.path.join(self.output_dir, filename)
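            # Note: filenames are derived from page titles only, so pages that
            # share a title will overwrite each other's output file.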
# Extract main content
main_content = (
soup.find("main")
or soup.find("article")
or soup.find("div", class_="content")
)
# If we found main content, extract the text
if main_content:
text_content = main_content.get_text(separator="\n", strip=True)
else:
# Fallback to body text
text_content = (
soup.body.get_text(separator="\n", strip=True)
if soup.body
else "No content"
)
logger.warning(
f"No main content found for {response.url}, falling back to body text"
)
# Save the extracted content
with open(filepath, "w", encoding="utf-8") as f:
f.write(f"URL: {response.url}\n")
f.write(f"Title: {title}\n\n")
f.write(text_content)
logger.info(f"Saved content from {response.url} to {filepath}")
except Exception as e:
logger.error(f"Error processing {response.url}: {e}", exc_info=True)


class DomainCrawler:
    """
    High-level crawler class for extracting content from a documentation website.

    This class provides a simple interface for crawling a website and extracting
    its content. It configures and runs a Scrapy crawler with sensible defaults
    for crawling documentation sites.

    Example:
        crawler = DomainCrawler("https://docs.example.com")
        crawler.start()  # Crawls the site and saves content to ./crawled_content/

    Args:
        start_url (str): The URL where crawling should begin
        output_dir (str, optional): Directory where extracted content should be
            saved. Defaults to "crawled_content".
    """

def __init__(self, start_url, output_dir="crawled_content"):
self.start_url = start_url
self.domain = urlparse(start_url).netloc
self.output_dir = output_dir
os.makedirs(output_dir, exist_ok=True)
logger.info(f"Created output directory: {output_dir}")
# Configure Scrapy settings
self.settings = get_project_settings()
self.settings.update(
{
"BOT_NAME": "website_crawler",
"ROBOTSTXT_OBEY": True,
"CONCURRENT_REQUESTS": 16,
"DOWNLOAD_DELAY": 1,
"COOKIES_ENABLED": False,
"USER_AGENT": "Mozilla/5.0 (compatible; SimplifyCrawler/1.0)",
}
)

    def start(self):
        """
        Start the crawling process.

        This method runs the crawler and blocks until crawling is complete.
        The extracted content will be saved to the configured output directory.
        """
logger.info(f"Starting crawl from {self.start_url}")
process = CrawlerProcess(self.settings)
process.crawl(
WebsiteSpider, start_url=self.start_url, output_dir=self.output_dir
)
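        # CrawlerProcess starts the Twisted reactor, which can only run once per
        # Python process, so start() is not safe to call a second time.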
process.start()
logger.info("\nCrawl completed!")
logger.info(f"Content saved to: {os.path.abspath(self.output_dir)}")