File size: 4,417 Bytes
323db03
e344fab
323db03
 
e344fab
 
323db03
26b3c2a
323db03
 
 
 
 
 
 
 
e344fab
323db03
 
 
 
 
26b3c2a
323db03
e344fab
 
 
 
26b3c2a
e344fab
 
 
 
26b3c2a
e344fab
26b3c2a
e344fab
 
 
 
 
26b3c2a
e344fab
 
 
 
26b3c2a
e344fab
 
 
26b3c2a
 
e344fab
 
26b3c2a
e344fab
 
 
26b3c2a
 
e344fab
 
 
 
 
 
26b3c2a
e344fab
 
 
 
 
 
 
 
 
 
26b3c2a
e344fab
 
26b3c2a
e344fab
 
26b3c2a
 
e344fab
26b3c2a
e344fab
 
26b3c2a
 
e344fab
 
 
 
26b3c2a
e344fab
 
 
 
26b3c2a
e344fab
26b3c2a
e344fab
 
 
26b3c2a
 
 
 
 
e344fab
26b3c2a
e344fab
 
 
 
26b3c2a
e344fab
 
26b3c2a
e344fab
 
 
 
 
26b3c2a
e344fab
 
 
 
26b3c2a
e344fab
 
 
26b3c2a
e344fab
 
26b3c2a
e344fab
26b3c2a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
import tempfile
from unittest.mock import Mock, patch

import pytest
from bs4 import BeautifulSoup
from scrapy.http import Response, Request

from backend.app.crawler import WebsiteSpider, DomainCrawler


@pytest.fixture
def sample_html():
    """Provide a small HTML document with a title and a ``<main>`` section."""
    document = """
    <html>
        <head><title>Test Page</title></head>
        <body>
            <main>
                <h1>Main Content</h1>
                <p>This is the main content.</p>
            </main>
        </body>
    </html>
    """
    return document


@pytest.fixture
def output_dir(tmp_path):
    """Return, as a string, a ``test_crawled_content`` path under ``tmp_path``.

    Only the path is built here; this fixture itself creates nothing on disk.
    """
    target = tmp_path / "test_crawled_content"
    return str(target)


def test_website_spider_initialization():
    """Check the attributes a freshly constructed WebsiteSpider exposes."""
    url, target_dir = "https://example.com", "test_output"

    spider = WebsiteSpider(start_url=url, output_dir=target_dir)

    # The start URL is wrapped in a list and its host becomes the only
    # allowed domain.
    assert spider.start_urls == [url]
    assert spider.allowed_domains == ["example.com"]
    assert spider.output_dir == target_dir
    # Exactly one crawl rule is expected.
    assert len(spider.rules) == 1


def test_parse_item_with_main_content(sample_html, output_dir):
    """Run ``parse_item`` on a page with a ``<main>`` element and check the saved file."""
    spider = WebsiteSpider(start_url="https://example.com", output_dir=output_dir)

    # Stand-in for a Scrapy response: only ``url`` and ``body`` are populated.
    fake_response = Mock(spec=Response)
    fake_response.url = "https://example.com/test"
    fake_response.body = sample_html.encode("utf-8")

    spider.parse_item(fake_response)

    # Exactly one file is expected in the output directory.
    written = os.listdir(output_dir)
    assert len(written) == 1

    saved_path = os.path.join(output_dir, written[0])
    with open(saved_path, "r", encoding="utf-8") as handle:
        text = handle.read()

    # Title, heading, paragraph text and the source URL line must all appear.
    for fragment in (
        "Test Page",
        "Main Content",
        "This is the main content",
        "URL: https://example.com/test",
    ):
        assert fragment in text


def test_parse_item_without_main_content(output_dir):
    """Run ``parse_item`` on a page lacking ``<main>`` and check the saved file."""
    page_source = """
    <html>
        <head><title>No Main Page</title></head>
        <body>
            <div>Some body content</div>
        </body>
    </html>
    """

    spider = WebsiteSpider(start_url="https://example.com", output_dir=output_dir)

    # Stand-in for a Scrapy response: only ``url`` and ``body`` are populated.
    fake_response = Mock(spec=Response)
    fake_response.url = "https://example.com/no-main"
    fake_response.body = page_source.encode("utf-8")

    spider.parse_item(fake_response)

    # Exactly one file is expected in the output directory.
    written = os.listdir(output_dir)
    assert len(written) == 1

    saved_path = os.path.join(output_dir, written[0])
    with open(saved_path, "r", encoding="utf-8") as handle:
        text = handle.read()

    # The title and the body text must both appear in the saved file.
    for fragment in ("No Main Page", "Some body content"):
        assert fragment in text


def test_domain_crawler_initialization():
    """Test the attributes and settings set up by DomainCrawler's constructor.

    Constructing a ``DomainCrawler`` creates ``output_dir`` on disk, so the
    crawler is pointed at a path inside a temporary directory. This keeps the
    test from leaving a stray ``test_output`` folder in the working directory.
    """
    start_url = "https://example.com"

    with tempfile.TemporaryDirectory() as scratch:
        output_dir = os.path.join(scratch, "test_output")

        crawler = DomainCrawler(start_url=start_url, output_dir=output_dir)

        # Constructor arguments are stored, and the domain is taken from the URL.
        assert crawler.start_url == start_url
        assert crawler.domain == "example.com"
        assert crawler.output_dir == output_dir

        # Settings the crawler is expected to configure.
        assert crawler.settings.get("BOT_NAME") == "website_crawler"
        assert crawler.settings.get("ROBOTSTXT_OBEY") is True
        assert crawler.settings.get("CONCURRENT_REQUESTS") == 16
        assert crawler.settings.get("DOWNLOAD_DELAY") == 1


@patch("backend.app.crawler.CrawlerProcess")
def test_domain_crawler_start(mock_crawler_process):
    """Test that ``DomainCrawler.start`` drives a ``CrawlerProcess``.

    ``CrawlerProcess`` is patched, so no real crawl runs. Constructing the
    crawler still creates ``output_dir`` on disk, so a temporary directory is
    used to avoid leaving a ``test_output`` folder in the working directory.

    Args:
        mock_crawler_process: Mock injected by ``patch`` in place of
            ``backend.app.crawler.CrawlerProcess``.
    """
    start_url = "https://example.com"

    with tempfile.TemporaryDirectory() as scratch:
        output_dir = os.path.join(scratch, "test_output")
        crawler = DomainCrawler(start_url=start_url, output_dir=output_dir)
        crawler.start()

    # The process must be built once from the crawler's settings, then have a
    # crawl scheduled and be started exactly once each.
    process = mock_crawler_process.return_value
    mock_crawler_process.assert_called_once_with(crawler.settings)
    process.crawl.assert_called_once()
    process.start.assert_called_once()


def test_output_directory_creation():
    """Test that constructing a DomainCrawler creates a missing output directory.

    The target lives inside a fresh temporary directory rather than under a
    fixed name in the working directory. It is therefore guaranteed not to
    exist beforehand, and it is removed afterwards even if an assertion fails
    or the crawler has written files into it.
    """
    start_url = "https://example.com"

    with tempfile.TemporaryDirectory() as scratch:
        output_dir = os.path.join(scratch, "test_output_dir")
        # Precondition: the parent is brand new, so the target cannot exist yet.
        assert not os.path.exists(output_dir)

        DomainCrawler(start_url=start_url, output_dir=output_dir)

        assert os.path.isdir(output_dir)