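"""Tests for backend.app.crawler (DomainCrawler and WebsiteSpider).

Most of the behavioural tests in this module are currently commented out;
the fixtures and the create_response helper below remain active.
"""
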
import os
from unittest.mock import Mock, patch

import pytest
from scrapy.http import Request, TextResponse

from backend.app.crawler import DomainCrawler, WebsiteSpider


@pytest.fixture
def sample_html():
    """Minimal HTML document with a title and a <main> content block."""
    return """
    <html>
        <head><title>Test Page</title></head>
        <body>
            <main>
                <h1>Main Content</h1>
                <p>This is the main content of the page.</p>
            </main>
        </body>
    </html>
    """


@pytest.fixture
def crawler():
    """DomainCrawler pointed at https://example.com, writing to a throwaway test_output dir."""
    return DomainCrawler("https://example.com", output_dir="test_output")


@pytest.fixture
def spider():
    """WebsiteSpider pointed at https://example.com, writing to a throwaway test_output dir."""
    return WebsiteSpider(start_url="https://example.com", output_dir="test_output")


# def test_crawler_initialization(crawler):
#     assert crawler.start_url == "https://example.com"
#     assert crawler.domain == "example.com"
#     assert crawler.output_dir == "test_output"
#     assert os.path.exists("test_output")

#     # Test Scrapy settings
#     assert crawler.settings.get('BOT_NAME') == 'website_crawler'
#     assert crawler.settings.get('ROBOTSTXT_OBEY') is True
#     assert crawler.settings.get('DOWNLOAD_DELAY') == 1


def create_response(url, body):
    """Build a Scrapy TextResponse, with its originating Request attached, from a URL and an HTML string."""
    request = Request(url=url)
    return TextResponse(
        url=url, body=body.encode("utf-8"), encoding="utf-8", request=request
    )
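

# A minimal active smoke test for the helper above. It is a sketch that relies only
# on Scrapy's TextResponse API (url, text, css selectors) and the sample_html fixture,
# and makes no assumptions about DomainCrawler or WebsiteSpider internals.
def test_create_response_helper(sample_html):
    url = "https://example.com/test"
    response = create_response(url, sample_html)

    # The helper should round-trip the URL and expose the decoded HTML body.
    assert response.url == url
    assert "Main Content" in response.text
    assert response.css("title::text").get() == "Test Page"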


# def test_spider_parse_with_main_content(spider, sample_html):
#     url = "https://example.com/test"
#     response = create_response(url, sample_html)

#     # Process the page
#     list(spider.parse_item(response))

#     # Check if file was created
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1

#     # Read the saved file
#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
#         content = f.read()

#     # Verify content
#     assert "URL: https://example.com/test" in content
#     assert "Title: Test Page" in content
#     assert "Main Content" in content
#     assert "This is the main content of the page." in content

# def test_spider_parse_without_main_content(spider):
#     html_without_main = """
#     <html>
#         <head><title>No Main</title></head>
#         <body>
#             <div>Some body content</div>
#         </body>
#     </html>
#     """

#     url = "https://example.com/no-main"
#     response = create_response(url, html_without_main)

#     # Process the page
#     list(spider.parse_item(response))

#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1

#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
#         content = f.read()

#     assert "URL: https://example.com/no-main" in content
#     assert "Title: No Main" in content
#     assert "Some body content" in content

# def test_spider_parse_with_invalid_html(spider):
#     invalid_html = "<invalid><<html>"
#     url = "https://example.com/invalid"
#     response = create_response(url, invalid_html)

#     # Process should not raise an exception
#     list(spider.parse_item(response))

#     # Should still create a file
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1

# @patch('scrapy.crawler.CrawlerProcess')
# def test_start_crawling(mock_crawler_process_class, crawler):
#     # Configure the mock
#     mock_process = Mock()
#     mock_crawler_process_class.return_value = mock_process

#     # Start crawling
#     crawler.start()

#     # Verify process was created with correct settings
#     mock_crawler_process_class.assert_called_once_with(crawler.settings)

#     # Verify crawl method was called
#     mock_process.crawl.assert_called_once()
#     mock_process.start.assert_called_once()


@pytest.fixture(autouse=True)
def cleanup():
    """Autouse fixture: delete the test_output directory after each test."""
    # No setup needed before the test runs
    yield
    # Teardown: remove anything the test wrote to test_output
    if os.path.exists("test_output"):
        for file in os.listdir("test_output"):
            os.remove(os.path.join("test_output", file))
        os.rmdir("test_output")