import pytest | |
import os | |
from unittest.mock import Mock, patch | |
from bs4 import BeautifulSoup | |
from scrapy.http import Response, Request, TextResponse | |
from backend.app.crawler import DomainCrawler, WebsiteSpider | |
def sample_html():
    """Return a small HTML document used as page content in the spider tests.

    The markup has a ``<title>`` of "Test Page" and a ``<main>`` element
    holding one heading and one paragraph.
    """
    # NOTE(review): the commented-out tests in this file take `sample_html`
    # as a parameter, which suggests a pytest fixture; no decorator is
    # visible on this definition -- confirm.
    page_markup = """
    <html>
        <head><title>Test Page</title></head>
        <body>
            <main>
                <h1>Main Content</h1>
                <p>This is the main content of the page.</p>
            </main>
        </body>
    </html>
    """
    return page_markup
def crawler():
    """Build the ``DomainCrawler`` instance used by the crawler tests.

    The crawler is constructed with ``https://example.com`` as its first
    argument and ``output_dir="test_output"``.
    """
    # NOTE(review): looks like a pytest fixture (the commented-out tests take
    # `crawler` as a parameter), but no decorator is visible here -- confirm.
    domain_crawler = DomainCrawler("https://example.com", output_dir="test_output")
    return domain_crawler
def spider():
    """Build the ``WebsiteSpider`` instance used by the spider tests.

    The spider is constructed with ``start_url="https://example.com"`` and
    ``output_dir="test_output"``.
    """
    # NOTE(review): looks like a pytest fixture (the commented-out tests take
    # `spider` as a parameter), but no decorator is visible here -- confirm.
    website_spider = WebsiteSpider(
        start_url="https://example.com",
        output_dir="test_output",
    )
    return website_spider
# def test_crawler_initialization(crawler):
#     assert crawler.start_url == "https://example.com"
#     assert crawler.domain == "example.com"
#     assert crawler.output_dir == "test_output"
#     assert os.path.exists("test_output")
#     # Test Scrapy settings
#     assert crawler.settings.get('BOT_NAME') == 'website_crawler'
#     assert crawler.settings.get('ROBOTSTXT_OBEY') is True
#     assert crawler.settings.get('DOWNLOAD_DELAY') == 1
def create_response(url, body):
    """Wrap *body* in a Scrapy ``TextResponse`` for *url*.

    Args:
        url: URL used both for the response and for the ``Request``
            attached to it.
        body: Page content as ``str``; it is encoded to UTF-8 bytes before
            being handed to the response.

    Returns:
        A ``TextResponse`` declared as UTF-8 whose ``request`` is a fresh
        ``Request`` for the same URL.
    """
    # The request is created first, exactly as before, so a bad URL is
    # reported before any attempt to encode the body.
    originating_request = Request(url=url)
    encoded_body = body.encode("utf-8")
    response = TextResponse(
        url=url,
        body=encoded_body,
        encoding="utf-8",
        request=originating_request,
    )
    return response
# def test_spider_parse_with_main_content(spider, sample_html):
#     url = "https://example.com/test"
#     response = create_response(url, sample_html)
#     # Process the page
#     list(spider.parse_item(response))
#     # Check if file was created
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1
#     # Read the saved file
#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
#         content = f.read()
#     # Verify content
#     assert "URL: https://example.com/test" in content
#     assert "Title: Test Page" in content
#     assert "Main Content" in content
#     assert "This is the main content of the page." in content
# def test_spider_parse_without_main_content(spider):
#     html_without_main = """
#     <html>
#         <head><title>No Main</title></head>
#         <body>
#             <div>Some body content</div>
#         </body>
#     </html>
#     """
#     url = "https://example.com/no-main"
#     response = create_response(url, html_without_main)
#     # Process the page
#     list(spider.parse_item(response))
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1
#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
#         content = f.read()
#     assert "URL: https://example.com/no-main" in content
#     assert "Title: No Main" in content
#     assert "Some body content" in content
# def test_spider_parse_with_invalid_html(spider):
#     invalid_html = "<invalid><<html>"
#     url = "https://example.com/invalid"
#     response = create_response(url, invalid_html)
#     # Process should not raise an exception
#     list(spider.parse_item(response))
#     # Should still create a file
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1
# @patch('scrapy.crawler.CrawlerProcess')
# def test_start_crawling(mock_crawler_process_class, crawler):
#     # Configure the mock
#     mock_process = Mock()
#     mock_crawler_process_class.return_value = mock_process
#     # Start crawling
#     crawler.start()
#     # Verify process was created with correct settings
#     mock_crawler_process_class.assert_called_once_with(crawler.settings)
#     # Verify crawl method was called
#     mock_process.crawl.assert_called_once()
#     mock_process.start.assert_called_once()
def cleanup():
    """Yield once, then delete the ``test_output`` directory and its contents.

    Everything before the ``yield`` is the (empty) setup phase; everything
    after it is teardown. The teardown removes ``test_output`` recursively,
    so nested directories left behind by a test no longer abort the cleanup
    and leak state into the next test. Nothing happens when the directory
    does not exist.

    Yields:
        None, exactly once.

    Raises:
        OSError: If ``test_output`` exists but cannot be removed.
    """
    # NOTE(review): the yield-based setup/teardown shape suggests a pytest
    # fixture, but no decorator is visible on this definition -- confirm.
    import shutil  # local import: only this teardown needs it

    # Setup - nothing needed
    yield
    # Cleanup after each test. rmtree (rather than per-file os.remove followed
    # by os.rmdir) also handles subdirectories inside the output directory.
    if os.path.exists("test_output"):
        shutil.rmtree("test_output")