import pytest
import os
from unittest.mock import Mock, patch
from bs4 import BeautifulSoup
from scrapy.http import Response, Request, TextResponse
from backend.app.crawler import DomainCrawler, WebsiteSpider


@pytest.fixture
def sample_html():
    return """
    <html>
        <head><title>Test Page</title></head>
        <body>
            <main>
                <h1>Main Content</h1>
                <p>This is the main content of the page.</p>
            </main>
        </body>
    </html>
    """


@pytest.fixture
def crawler():
    return DomainCrawler("https://example.com", output_dir="test_output")


@pytest.fixture
def spider():
    return WebsiteSpider(start_url="https://example.com", output_dir="test_output")


# def test_crawler_initialization(crawler):
#     assert crawler.start_url == "https://example.com"
#     assert crawler.domain == "example.com"
#     assert crawler.output_dir == "test_output"
#     assert os.path.exists("test_output")
#
#     # Test Scrapy settings
#     assert crawler.settings.get('BOT_NAME') == 'website_crawler'
#     assert crawler.settings.get('ROBOTSTXT_OBEY') is True
#     assert crawler.settings.get('DOWNLOAD_DELAY') == 1


def create_response(url, body):
    """Build a Scrapy TextResponse from a URL and an HTML string, so parsing
    can be tested without performing a real HTTP request."""
    request = Request(url=url)
    return TextResponse(
        url=url, body=body.encode("utf-8"), encoding="utf-8", request=request
    )
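

# The spider and crawler tests below are currently commented out. The check
# here is only a minimal sketch that exercises the create_response() helper
# and the sample_html fixture using Scrapy's TextResponse and BeautifulSoup
# (both imported above); it does not touch DomainCrawler or WebsiteSpider.
def test_create_response_builds_parseable_response(sample_html):
    url = "https://example.com/test"
    response = create_response(url, sample_html)

    # TextResponse keeps the request URL and decodes the body as UTF-8
    assert response.url == url
    assert response.text.strip().startswith("<html>")

    # The fixture HTML should parse and contain the expected elements
    soup = BeautifulSoup(response.text, "html.parser")
    assert soup.title.string == "Test Page"
    assert soup.find("main").find("h1").get_text() == "Main Content"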


# def test_spider_parse_with_main_content(spider, sample_html):
#     url = "https://example.com/test"
#     response = create_response(url, sample_html)
#
#     # Process the page
#     list(spider.parse_item(response))
#
#     # Check if file was created
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1
#
#     # Read the saved file
#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
#         content = f.read()
#
#     # Verify content
#     assert "URL: https://example.com/test" in content
#     assert "Title: Test Page" in content
#     assert "Main Content" in content
#     assert "This is the main content of the page." in content


# def test_spider_parse_without_main_content(spider):
#     html_without_main = """
#     <html>
#         <head><title>No Main</title></head>
#         <body>
#             <div>Some body content</div>
#         </body>
#     </html>
#     """
#     url = "https://example.com/no-main"
#     response = create_response(url, html_without_main)
#
#     # Process the page
#     list(spider.parse_item(response))
#
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1
#
#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
#         content = f.read()
#
#     assert "URL: https://example.com/no-main" in content
#     assert "Title: No Main" in content
#     assert "Some body content" in content


# def test_spider_parse_with_invalid_html(spider):
#     invalid_html = "<invalid><<html>"
#     url = "https://example.com/invalid"
#     response = create_response(url, invalid_html)
#
#     # Process should not raise an exception
#     list(spider.parse_item(response))
#
#     # Should still create a file
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1


# @patch('scrapy.crawler.CrawlerProcess')
# def test_start_crawling(mock_crawler_process_class, crawler):
#     # Configure the mock
#     mock_process = Mock()
#     mock_crawler_process_class.return_value = mock_process
#
#     # Start crawling
#     crawler.start()
#
#     # Verify process was created with correct settings
#     mock_crawler_process_class.assert_called_once_with(crawler.settings)
#
#     # Verify crawl method was called
#     mock_process.crawl.assert_called_once()
#     mock_process.start.assert_called_once()


@pytest.fixture(autouse=True)
def cleanup():
    # Setup - nothing needed
    yield

    # Cleanup after each test
    if os.path.exists("test_output"):
        for file in os.listdir("test_output"):
            os.remove(os.path.join("test_output", file))
        os.rmdir("test_output")