# SimpliFi: backend/app/crawler.py
"""
A web crawler module for extracting content from documentation websites.

This module provides classes for crawling a domain and extracting main content
from web pages, saving the results as text files.
"""

import logging
import os
import re
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.utils.project import get_project_settings

logger = logging.getLogger(__name__)
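
# Typical usage (a minimal sketch; the import path and URL below are assumed
# from the repository layout and are purely illustrative):
#
#   from app.crawler import DomainCrawler
#
#   crawler = DomainCrawler("https://docs.example.com", output_dir="crawled_content")
#   crawler.start()

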
class WebsiteSpider(CrawlSpider):
"""
A Scrapy spider for crawling documentation websites and extracting content.
This spider follows links within the allowed domain and extracts the main content
from each page, saving it to a text file. It attempts to find content within <main>,
<article> or content div tags, falling back to the full body if none are found.
Args:
start_url (str): The URL where crawling should begin
output_dir (str): Directory where extracted content should be saved
*args: Additional positional arguments passed to CrawlSpider
**kwargs: Additional keyword arguments passed to CrawlSpider
"""
name = "website_spider"
    def __init__(self, start_url, output_dir, *args, **kwargs):
        self.start_urls = [start_url]
        self.allowed_domains = [urlparse(start_url).netloc]
        self.output_dir = output_dir

        # The crawl rules must be assigned before CrawlSpider.__init__ runs,
        # because it compiles them (via _compile_rules) during initialization;
        # assigning them afterwards would leave the spider with no active rules.
        self.rules = (
            Rule(
                LinkExtractor(allow_domains=self.allowed_domains),
                callback="parse_item",
                follow=True,
            ),
        )

        super().__init__(*args, **kwargs)

        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Created output directory: {output_dir}")

def parse_item(self, response):
"""
Parse a webpage and extract its content.
Args:
response: The Scrapy response object containing the webpage
The extracted content is saved to a text file in the output directory,
including the page URL, title and main content.
"""
try:
# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(response.body, "html.parser")
            # Extract the title; soup.title.string can be None if the <title>
            # tag contains nested markup, so guard against that as well
            title = (
                soup.title.string.strip()
                if soup.title and soup.title.string
                else "No Title"
            )

            # Build a filesystem-safe filename from the title
            filename = re.sub(r"[^\w\-_]", "_", title) + ".txt"
filepath = os.path.join(self.output_dir, filename)
# Extract main content
main_content = (
soup.find("main")
or soup.find("article")
or soup.find("div", class_="content")
)
# If we found main content, extract the text
if main_content:
text_content = main_content.get_text(separator="\n", strip=True)
else:
# Fallback to body text
text_content = (
soup.body.get_text(separator="\n", strip=True)
if soup.body
else "No content"
)
logger.warning(
f"No main content found for {response.url}, falling back to body text"
)
# Save the extracted content
with open(filepath, "w", encoding="utf-8") as f:
f.write(f"URL: {response.url}\n")
f.write(f"Title: {title}\n\n")
f.write(text_content)
logger.info(f"Saved content from {response.url} to {filepath}")
except Exception as e:
logger.error(f"Error processing {response.url}: {e}", exc_info=True)


class DomainCrawler:
"""
High-level crawler class for extracting content from a documentation website.
This class provides a simple interface for crawling a website and extracting its
content. It configures and runs a Scrapy crawler with sensible defaults for
crawling documentation sites.
Example:
crawler = DomainCrawler("https://docs.example.com")
crawler.start() # Crawls the site and saves content to ./crawled_content/
Args:
start_url (str): The URL where crawling should begin
output_dir (str, optional): Directory where extracted content should be saved.
Defaults to "crawled_content"
"""
def __init__(self, start_url, output_dir="crawled_content"):
self.start_url = start_url
self.domain = urlparse(start_url).netloc
self.output_dir = output_dir
os.makedirs(output_dir, exist_ok=True)
logger.info(f"Created output directory: {output_dir}")
        # Configure Scrapy settings with polite defaults for documentation sites
        self.settings = get_project_settings()
        self.settings.update(
            {
                "BOT_NAME": "website_crawler",
                "ROBOTSTXT_OBEY": True,  # honor the site's robots.txt
                "CONCURRENT_REQUESTS": 16,
                "DOWNLOAD_DELAY": 1,  # wait 1 second between requests to the same site
                "COOKIES_ENABLED": False,
                "USER_AGENT": "Mozilla/5.0 (compatible; SimplifyCrawler/1.0)",
            }
        )

def start(self):
"""
Start the crawling process.
This method initiates the crawler and blocks until crawling is complete.
The extracted content will be saved to the configured output directory.
"""
logger.info(f"Starting crawl from {self.start_url}")
process = CrawlerProcess(self.settings)
process.crawl(
WebsiteSpider, start_url=self.start_url, output_dir=self.output_dir
)
process.start()
logger.info("\nCrawl completed!")
logger.info(f"Content saved to: {os.path.abspath(self.output_dir)}")