|
import os |
|
import pytest |
|
from unittest.mock import Mock, patch |
|
from bs4 import BeautifulSoup |
|
from scrapy.http import Response, Request |
|
from backend.app.crawler import WebsiteSpider, DomainCrawler |
|
|
|
|
|
@pytest.fixture |
|
def sample_html(): |
|
return """ |
|
<html> |
|
<head><title>Test Page</title></head> |
|
<body> |
|
<main> |
|
<h1>Main Content</h1> |
|
<p>This is the main content.</p> |
|
</main> |
|
</body> |
|
</html> |
|
""" |
|
|
|
|
|
@pytest.fixture |
|
def output_dir(tmp_path): |
|
"""Create a temporary directory for test outputs""" |
|
return str(tmp_path / "test_crawled_content") |
|
|
|
|
|
def test_website_spider_initialization(): |
|
"""Test WebsiteSpider initialization with correct parameters""" |
|
start_url = "https://example.com" |
|
output_dir = "test_output" |
|
|
|
spider = WebsiteSpider(start_url=start_url, output_dir=output_dir) |
|
|
|
assert spider.start_urls == [start_url] |
|
assert spider.allowed_domains == ["example.com"] |
|
assert spider.output_dir == output_dir |
|
assert len(spider.rules) == 1 |
|
|
|
|
|
def test_parse_item_with_main_content(sample_html, output_dir): |
|
"""Test parsing a page with main content section""" |
|
start_url = "https://example.com" |
|
spider = WebsiteSpider(start_url=start_url, output_dir=output_dir) |
|
|
|
|
|
mock_response = Mock(spec=Response) |
|
mock_response.url = "https://example.com/test" |
|
mock_response.body = sample_html.encode("utf-8") |
|
|
|
|
|
spider.parse_item(mock_response) |
|
|
|
|
|
files = os.listdir(output_dir) |
|
assert len(files) == 1 |
|
|
|
with open(os.path.join(output_dir, files[0]), "r", encoding="utf-8") as f: |
|
content = f.read() |
|
assert "Test Page" in content |
|
assert "Main Content" in content |
|
assert "This is the main content" in content |
|
assert "URL: https://example.com/test" in content |
|
|
|
|
|
def test_parse_item_without_main_content(output_dir): |
|
"""Test parsing a page without main content section""" |
|
html_without_main = """ |
|
<html> |
|
<head><title>No Main Page</title></head> |
|
<body> |
|
<div>Some body content</div> |
|
</body> |
|
</html> |
|
""" |
|
|
|
start_url = "https://example.com" |
|
spider = WebsiteSpider(start_url=start_url, output_dir=output_dir) |
|
|
|
mock_response = Mock(spec=Response) |
|
mock_response.url = "https://example.com/no-main" |
|
mock_response.body = html_without_main.encode("utf-8") |
|
|
|
spider.parse_item(mock_response) |
|
|
|
files = os.listdir(output_dir) |
|
assert len(files) == 1 |
|
|
|
with open(os.path.join(output_dir, files[0]), "r", encoding="utf-8") as f: |
|
content = f.read() |
|
assert "No Main Page" in content |
|
assert "Some body content" in content |
|
|
|
|
|
def test_domain_crawler_initialization(): |
|
"""Test DomainCrawler initialization""" |
|
start_url = "https://example.com" |
|
output_dir = "test_output" |
|
|
|
crawler = DomainCrawler(start_url=start_url, output_dir=output_dir) |
|
|
|
assert crawler.start_url == start_url |
|
assert crawler.domain == "example.com" |
|
assert crawler.output_dir == output_dir |
|
assert crawler.settings.get("BOT_NAME") == "website_crawler" |
|
assert crawler.settings.get("ROBOTSTXT_OBEY") is True |
|
assert crawler.settings.get("CONCURRENT_REQUESTS") == 16 |
|
assert crawler.settings.get("DOWNLOAD_DELAY") == 1 |
|
|
|
|
|
@patch("backend.app.crawler.CrawlerProcess") |
|
def test_domain_crawler_start(mock_crawler_process): |
|
"""Test starting the domain crawler""" |
|
start_url = "https://example.com" |
|
output_dir = "test_output" |
|
|
|
crawler = DomainCrawler(start_url=start_url, output_dir=output_dir) |
|
crawler.start() |
|
|
|
|
|
mock_crawler_process.assert_called_once_with(crawler.settings) |
|
mock_crawler_process.return_value.crawl.assert_called_once() |
|
mock_crawler_process.return_value.start.assert_called_once() |
|
|
|
|
|
def test_output_directory_creation(): |
|
"""Test that output directory is created if it doesn't exist""" |
|
start_url = "https://example.com" |
|
output_dir = "test_output_dir" |
|
|
|
|
|
if os.path.exists(output_dir): |
|
os.rmdir(output_dir) |
|
|
|
crawler = DomainCrawler(start_url=start_url, output_dir=output_dir) |
|
assert os.path.exists(output_dir) |
|
|
|
|
|
os.rmdir(output_dir) |
|
|