"""
A web crawler module for extracting content from documentation websites.

This module provides classes for crawling a domain and extracting main content
from web pages, saving the results as text files.
"""

import logging
import os
import re
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.utils.project import get_project_settings

logger = logging.getLogger(__name__)


class WebsiteSpider(CrawlSpider):
    """
    A Scrapy spider for crawling documentation websites and extracting content.

    This spider follows links within the allowed domain and extracts the main content
    from each page, saving it to a text file. It looks for content inside a <main>,
    <article>, or <div class="content"> element, falling back to the full body text
    if none of these is found.

    Args:
        start_url (str): The URL where crawling should begin
        output_dir (str): Directory where extracted content should be saved
        *args: Additional positional arguments passed to CrawlSpider
        **kwargs: Additional keyword arguments passed to CrawlSpider
    """

    name = "website_spider"

    def __init__(self, start_url, output_dir, *args, **kwargs):
        self.start_urls = [start_url]
        self.allowed_domains = [urlparse(start_url).netloc]
        self.output_dir = output_dir

        # Rules must be assigned before CrawlSpider.__init__ runs, because the
        # base class compiles them during initialization; setting them afterwards
        # would leave the spider without any link-following rules.
        self.rules = (
            Rule(
                LinkExtractor(allow_domains=self.allowed_domains),
                callback="parse_item",
                follow=True,
            ),
        )

        super().__init__(*args, **kwargs)

        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Created output directory: {output_dir}")

    def parse_item(self, response):
        """
        Parse a webpage and extract its content.

        Args:
            response: The Scrapy response object containing the webpage

        The extracted content is saved to a text file in the output directory,
        including the page URL, title and main content.
        """
        try:
            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(response.body, "html.parser")

            # Extract the title; get_text() also handles titles with nested
            # markup, where .string would be None
            title = soup.title.get_text(strip=True) if soup.title else ""
            if not title:
                title = "No Title"

            # Build a filesystem-safe filename from the title
            filename = re.sub(r"[^\w\-_]", "_", title) + ".txt"
            filepath = os.path.join(self.output_dir, filename)

            # Extract main content
            main_content = (
                soup.find("main")
                or soup.find("article")
                or soup.find("div", class_="content")
            )

            # If we found main content, extract the text
            if main_content:
                text_content = main_content.get_text(separator="\n", strip=True)
            else:
                # Fallback to body text
                text_content = (
                    soup.body.get_text(separator="\n", strip=True)
                    if soup.body
                    else "No content"
                )
                logger.warning(
                    f"No main content found for {response.url}, falling back to body text"
                )

            # Save the extracted content
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(f"URL: {response.url}\n")
                f.write(f"Title: {title}\n\n")
                f.write(text_content)

            logger.info(f"Saved content from {response.url} to {filepath}")

        except Exception as e:
            logger.error(f"Error processing {response.url}: {e}", exc_info=True)


class DomainCrawler:
    """
    High-level crawler class for extracting content from a documentation website.

    This class provides a simple interface for crawling a website and extracting its
    content. It configures and runs a Scrapy crawler with sensible defaults for
    crawling documentation sites.

    Example:
        crawler = DomainCrawler("https://docs.example.com")
        crawler.start()  # Crawls the site and saves content to ./crawled_content/

    Args:
        start_url (str): The URL where crawling should begin
        output_dir (str, optional): Directory where extracted content should be saved.
            Defaults to "crawled_content"
    """

    def __init__(self, start_url, output_dir="crawled_content"):
        self.start_url = start_url
        self.domain = urlparse(start_url).netloc
        self.output_dir = output_dir

        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Created output directory: {output_dir}")

        # Configure Scrapy settings
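        # These defaults aim at polite crawling of documentation sites: obey
        # robots.txt, keep a one-second delay between requests, skip cookies,
        # and identify the crawler with an explicit user agent.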
        self.settings = get_project_settings()
        self.settings.update(
            {
                "BOT_NAME": "website_crawler",
                "ROBOTSTXT_OBEY": True,
                "CONCURRENT_REQUESTS": 16,
                "DOWNLOAD_DELAY": 1,
                "COOKIES_ENABLED": False,
                "USER_AGENT": "Mozilla/5.0 (compatible; SimplifyCrawler/1.0)",
            }
        )

    def start(self):
        """
        Start the crawling process.

        This method initiates the crawler and blocks until crawling is complete.
        The extracted content will be saved to the configured output directory.
        """
        logger.info(f"Starting crawl from {self.start_url}")

        process = CrawlerProcess(self.settings)
        process.crawl(
            WebsiteSpider, start_url=self.start_url, output_dir=self.output_dir
        )
        process.start()

        logger.info("\nCrawl completed!")
        logger.info(f"Content saved to: {os.path.abspath(self.output_dir)}")