# SimpliFi/backend/app/crawler.py
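
"""Domain-restricted web crawler.

Wraps a spidy crawler and post-processes each fetched page with
BeautifulSoup, writing the extracted text of every page to a local
output directory.
"""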

import logging
import os
import re
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from spidy import crawler

logger = logging.getLogger(__name__)


class DomainCrawler:
    def __init__(self, start_url, output_dir="crawled_content"):
        self.start_url = start_url
        self.domain = urlparse(start_url).netloc
        self.output_dir = output_dir

        # Create output directory if it doesn't exist
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            logger.info(f"Created output directory: {output_dir}")

        # Initialize the crawler, restricted to the start URL's domain
        self.crawler = crawler.Crawler(
            start_url=start_url,
            max_pages=1000,
            timeout=10,
            delay=0.5,
            save_pages=True,
            save_path=output_dir,
            restrict_domain=True,
            verbose=True,
        )

        # Set custom handlers
        self.crawler.page_handler = self.process_page
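        # NOTE: the Crawler keyword arguments and the page_handler hook are
        # assumptions; spidy's public API differs across versions, so verify
        # them against the installed release.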

    def process_page(self, url, content):
        """Custom page processor that extracts and saves content."""
        try:
            # Parse the HTML
            soup = BeautifulSoup(content, "html.parser")

            # Extract the title; soup.title.string can be None, so guard both
            title = (
                soup.title.string.strip()
                if soup.title and soup.title.string
                else "No Title"
            )

            # Build a filesystem-safe filename from the title
            filename = re.sub(r"[^\w\-_]", "_", title) + ".txt"
            filepath = os.path.join(self.output_dir, filename)

            # Extract main content (this is just an example - adjust for your site)
            main_content = (
                soup.find("main")
                or soup.find("article")
                or soup.find("div", class_="content")
            )

            # If we found main content, extract the text
            if main_content:
                text_content = main_content.get_text(separator="\n", strip=True)
            else:
                # Fall back to body text
                text_content = (
                    soup.body.get_text(separator="\n", strip=True)
                    if soup.body
                    else "No content"
                )
                logger.warning(
                    f"No main content found for {url}, falling back to body text"
                )

            # Save the extracted content
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(f"URL: {url}\n")
                f.write(f"Title: {title}\n\n")
                f.write(text_content)

            logger.info(f"Saved content from {url} to {filepath}")
        except Exception as e:
            logger.error(f"Error processing {url}: {e}", exc_info=True)

        return content  # Return the original content so the crawler can continue

    def start(self):
        """Start the crawling process."""
        logger.info(f"Starting crawl from {self.start_url}")
        self.crawler.crawl()

        # Log a summary
        logger.info("Crawl completed!")
        logger.info(f"Pages crawled: {len(self.crawler.links_crawled)}")
        logger.info(f"Content saved to: {os.path.abspath(self.output_dir)}")