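"""Crawl a single domain with Scrapy and save each page's main text content
to a per-page .txt file, using BeautifulSoup for content extraction."""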
import os
import re
import logging
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from bs4 import BeautifulSoup
from urllib.parse import urlparse

logger = logging.getLogger(__name__)


class WebsiteSpider(CrawlSpider):
    name = "website_spider"

    def __init__(self, start_url, output_dir, *args, **kwargs):
        self.start_urls = [start_url]
        self.allowed_domains = [urlparse(start_url).netloc]
        self.output_dir = output_dir

        # Define rules for link extraction
        self.rules = (
            Rule(
                LinkExtractor(allow_domains=self.allowed_domains),
                callback="parse_item",
                follow=True,
            ),
        )

        # Rules must be assigned before calling super().__init__(), because
        # CrawlSpider compiles self.rules during its own __init__
        super().__init__(*args, **kwargs)

    def parse_item(self, response):
        try:
            # Parse the HTML with BeautifulSoup (response.text is already
            # decoded using the encoding Scrapy detected)
            soup = BeautifulSoup(response.text, "html.parser")

            # Extract the title; soup.title.string is None for empty or
            # nested <title> tags, so fall back explicitly
            title = soup.title.get_text(strip=True) if soup.title else ""
            if not title:
                title = "No Title"

            # Build a filesystem-safe filename from the title, truncated to
            # stay under OS filename limits (note: pages with identical
            # titles will overwrite one another)
            filename = re.sub(r"[^\w\-]", "_", title)[:100] + ".txt"
            filepath = os.path.join(self.output_dir, filename)

            # Extract main content
            main_content = (
                soup.find("main")
                or soup.find("article")
                or soup.find("div", class_="content")
            )

            # If we found main content, extract the text
            if main_content:
                text_content = main_content.get_text(separator="\n", strip=True)
            else:
                # Fallback to body text
                text_content = (
                    soup.body.get_text(separator="\n", strip=True)
                    if soup.body
                    else "No content"
                )
                logger.warning(
                    f"No main content found for {response.url}, falling back to body text"
                )

            # Save the extracted content
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(f"URL: {response.url}\n")
                f.write(f"Title: {title}\n\n")
                f.write(text_content)

            logger.info(f"Saved content from {response.url} to {filepath}")

        except Exception as e:
            logger.error(f"Error processing {response.url}: {e}", exc_info=True)


class DomainCrawler:
    def __init__(self, start_url, output_dir="crawled_content"):
        self.start_url = start_url
        self.domain = urlparse(start_url).netloc
        self.output_dir = output_dir

        # Create output directory if it doesn't exist
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            logger.info(f"Created output directory: {output_dir}")

        # Configure Scrapy settings
        self.settings = get_project_settings()
        self.settings.update(
            {
                "BOT_NAME": "website_crawler",
                "ROBOTSTXT_OBEY": True,
                "CONCURRENT_REQUESTS": 16,
                "DOWNLOAD_DELAY": 1,
                "COOKIES_ENABLED": False,
                "USER_AGENT": "Mozilla/5.0 (compatible; SimplifyCrawler/1.0)",
            }
        )

    def start(self):
        """Start the crawling process"""
        logger.info(f"Starting crawl from {self.start_url}")

        process = CrawlerProcess(self.settings)
        process.crawl(
            WebsiteSpider, start_url=self.start_url, output_dir=self.output_dir
        )
        # Blocks until the crawl finishes; the underlying Twisted reactor
        # can only be started once per process
        process.start()

        logger.info("\nCrawl completed!")
        logger.info(f"Content saved to: {os.path.abspath(self.output_dir)}")