import os
import shutil
from unittest.mock import Mock, patch

import pytest
from scrapy.http import Request, TextResponse

from backend.app.crawler import DomainCrawler, WebsiteSpider


@pytest.fixture
def sample_html():
    return """
    <html>
        <head><title>Test Page</title></head>
        <body>
            <main>
                <h1>Main Content</h1>
                <p>This is the main content of the page.</p>
            </main>
        </body>
    </html>
    """


@pytest.fixture
def crawler():
    return DomainCrawler("https://example.com", output_dir="test_output")


@pytest.fixture
def spider():
    return WebsiteSpider(start_url="https://example.com", output_dir="test_output")


# def test_crawler_initialization(crawler):
#     assert crawler.start_url == "https://example.com"
#     assert crawler.domain == "example.com"
#     assert crawler.output_dir == "test_output"
#     assert os.path.exists("test_output")
#
#     # Test Scrapy settings
#     assert crawler.settings.get('BOT_NAME') == 'website_crawler'
#     assert crawler.settings.get('ROBOTSTXT_OBEY') is True
#     assert crawler.settings.get('DOWNLOAD_DELAY') == 1
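

# A live smoke test distilled from the commented-out block above; it assumes,
# as that block does, that DomainCrawler exposes start_url and output_dir.
def test_crawler_stores_configuration(crawler):
    assert crawler.start_url == "https://example.com"
    assert crawler.output_dir == "test_output"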


def create_response(url, body):
    """Build a TextResponse with an attached Request, as Scrapy hands to spider callbacks."""
    request = Request(url=url)
    return TextResponse(
        url=url, body=body.encode("utf-8"), encoding="utf-8", request=request
    )
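

# A minimal sanity check for the create_response helper above; it relies only
# on Scrapy's public TextResponse API, not on the crawler code under test.
def test_create_response_roundtrip(sample_html):
    url = "https://example.com/helper"
    response = create_response(url, sample_html)
    # The helper should attach the request and decode the body back to the HTML.
    assert response.url == url
    assert response.request.url == url
    assert "Test Page" in response.text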


# def test_spider_parse_with_main_content(spider, sample_html):
#     url = "https://example.com/test"
#     response = create_response(url, sample_html)
#
#     # Process the page
#     list(spider.parse_item(response))
#
#     # Check if file was created
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1
#
#     # Read the saved file
#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
#         content = f.read()
#
#     # Verify content
#     assert "URL: https://example.com/test" in content
#     assert "Title: Test Page" in content
#     assert "Main Content" in content
#     assert "This is the main content of the page." in content


# def test_spider_parse_without_main_content(spider):
#     html_without_main = """
#     <html>
#         <head><title>No Main</title></head>
#         <body>
#             <p>Some body content</p>
#         </body>
#     </html>
#     """
#     url = "https://example.com/no-main"
#     response = create_response(url, html_without_main)
#
#     # Process the page
#     list(spider.parse_item(response))
#
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1
#
#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
#         content = f.read()
#
#     assert "URL: https://example.com/no-main" in content
#     assert "Title: No Main" in content
#     assert "Some body content" in content


# def test_spider_parse_with_invalid_html(spider):
#     invalid_html = "<"
#     url = "https://example.com/invalid"
#     response = create_response(url, invalid_html)
#
#     # Process should not raise an exception
#     list(spider.parse_item(response))
#
#     # Should still create a file
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1


# NOTE: patch replaces the name where it is looked up; if backend.app.crawler
# does `from scrapy.crawler import CrawlerProcess`, the target should be
# 'backend.app.crawler.CrawlerProcess' instead.
# @patch('scrapy.crawler.CrawlerProcess')
# def test_start_crawling(mock_crawler_process_class, crawler):
#     # Configure the mock
#     mock_process = Mock()
#     mock_crawler_process_class.return_value = mock_process
#
#     # Start crawling
#     crawler.start()
#
#     # Verify process was created with correct settings
#     mock_crawler_process_class.assert_called_once_with(crawler.settings)
#
#     # Verify crawl method was called
#     mock_process.crawl.assert_called_once()
#     mock_process.start.assert_called_once()


@pytest.fixture(autouse=True)
def cleanup():
    # Setup - nothing needed
    yield
    # Cleanup after each test; shutil.rmtree also handles any nested
    # directories the crawler may create under test_output.
    if os.path.exists("test_output"):
        shutil.rmtree("test_output")
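
# Design note: pytest's built-in tmp_path fixture could replace the hard-coded
# "test_output" directory, but these fixtures keep the literal path that
# DomainCrawler and WebsiteSpider are constructed with above.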