Added outlines of crawler code

Files changed:
- backend/app/crawler.py        +57 -36
- backend/app/main.py           +1 -1
- backend/tests/test_crawler.py +137 -0 (new file)
backend/app/crawler.py (CHANGED)

The spidy-based crawler is replaced with a Scrapy CrawlSpider plus a DomainCrawler wrapper. Removed lines that are truncated or hidden in the rendered diff are marked with "…".

@@ -1,44 +1,39 @@
-from spidy import crawler
 import os
 import re
 import logging
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
 from bs4 import BeautifulSoup
-from urllib.parse import …
+from urllib.parse import urlparse

 logger = logging.getLogger(__name__)


-class DomainCrawler:
-    …
-        self.start_url = start_url
-        self.domain = urlparse(start_url).netloc
-        self.output_dir = output_dir
+class WebsiteSpider(CrawlSpider):
+    name = "website_spider"

-        …
-        …
-        …
-        …
+    def __init__(self, start_url, output_dir, *args, **kwargs):
+        self.start_urls = [start_url]
+        self.allowed_domains = [urlparse(start_url).netloc]
+        self.output_dir = output_dir

-        # …
-        self.…
-            …
-            …
-            …
-            …
-            …
-            save_path=output_dir,
-            restrict_domain=True,
-            verbose=True,
+        # Define rules for link extraction
+        self.rules = (
+            Rule(
+                LinkExtractor(allow_domains=self.allowed_domains),
+                callback="parse_item",
+                follow=True,
+            ),
         )

-        …
-        self.crawler.page_handler = self.process_page
+        super().__init__(*args, **kwargs)

-    def …
-        """Custom page processor that extracts and saves content"""
+    def parse_item(self, response):
         try:
-            # Parse the HTML
-            soup = BeautifulSoup(…
+            # Parse the HTML with BeautifulSoup
+            soup = BeautifulSoup(response.body, "html.parser")

             # Extract the title
             title = soup.title.string if soup.title else "No Title"

@@ -47,7 +42,7 @@ class DomainCrawler:
             filename = re.sub(r"[^\w\-_]", "_", title) + ".txt"
             filepath = os.path.join(self.output_dir, filename)

-            # Extract main content
+            # Extract main content
             main_content = (
                 soup.find("main")
                 or soup.find("article")

@@ -65,28 +60,54 @@ class DomainCrawler:
                 else "No content"
             )
             logger.warning(
-                f"No main content found for {url}, falling back to body text"
+                f"No main content found for {response.url}, falling back to body text"
             )

             # Save the extracted content
             with open(filepath, "w", encoding="utf-8") as f:
-                f.write(f"URL: {url}\n")
+                f.write(f"URL: {response.url}\n")
                 f.write(f"Title: {title}\n\n")
                 f.write(text_content)

-            logger.info(f"Saved content from {url} to {filepath}")
+            logger.info(f"Saved content from {response.url} to {filepath}")

         except Exception as e:
-            logger.error(f"Error processing {url}: {e}", exc_info=True)
+            logger.error(f"Error processing {response.url}: {e}", exc_info=True)

-    …
+
+class DomainCrawler:
+    def __init__(self, start_url, output_dir="crawled_content"):
+        self.start_url = start_url
+        self.domain = urlparse(start_url).netloc
+        self.output_dir = output_dir
+
+        # Create output directory if it doesn't exist
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+            logger.info(f"Created output directory: {output_dir}")
+
+        # Configure Scrapy settings
+        self.settings = get_project_settings()
+        self.settings.update(
+            {
+                "BOT_NAME": "website_crawler",
+                "ROBOTSTXT_OBEY": True,
+                "CONCURRENT_REQUESTS": 16,
+                "DOWNLOAD_DELAY": 1,
+                "COOKIES_ENABLED": False,
+                "USER_AGENT": "Mozilla/5.0 (compatible; SimplifyCrawler/1.0)",
+            }
+        )

     def start(self):
         """Start the crawling process"""
         logger.info(f"Starting crawl from {self.start_url}")
-        self.crawler.crawl()

-        …
+        process = CrawlerProcess(self.settings)
+        process.crawl(
+            WebsiteSpider, start_url=self.start_url, output_dir=self.output_dir
+        )
+        process.start()
+
         logger.info("\nCrawl completed!")
-        logger.info(f"Pages crawled: {len(self.crawler.links_crawled)}")
         logger.info(f"Content saved to: {os.path.abspath(self.output_dir)}")
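For reference, a minimal way to exercise the new DomainCrawler outside the API might look like the sketch below. It is not part of this commit; the URL and output directory are placeholder values, and the import path assumes the repository layout shown above.

import logging

from backend.app.crawler import DomainCrawler

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Hypothetical example values; replace with a real site and directory
    crawler = DomainCrawler("https://example.com", output_dir="crawled_content")
    crawler.start()  # blocks until the Scrapy crawl completes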
backend/app/main.py (CHANGED)

The crawler import is switched to an absolute package path.

@@ -9,7 +9,7 @@ from typing import Dict, List
 import asyncio
 import logging
 import os
-from crawler import DomainCrawler
+from backend.app.crawler import DomainCrawler

 app = FastAPI()

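Since CrawlerProcess.start() blocks and Twisted's reactor cannot be restarted within the same process, main.py would likely have to run each crawl outside the server process. A minimal sketch of one way to do that, assuming a hypothetical /crawl endpoint (none of this is in the commit):

from multiprocessing import Process

from fastapi import FastAPI

from backend.app.crawler import DomainCrawler

app = FastAPI()


def run_crawl(start_url: str, output_dir: str) -> None:
    # Runs in a child process so the blocking Scrapy crawl does not stall
    # the FastAPI event loop and gets a fresh Twisted reactor each time.
    DomainCrawler(start_url, output_dir=output_dir).start()


@app.post("/crawl")
def start_crawl(start_url: str):
    # Fire-and-forget: the crawl continues in its own process
    Process(target=run_crawl, args=(start_url, "crawled_content"), daemon=True).start()
    return {"status": "started", "start_url": start_url}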
backend/tests/test_crawler.py (ADDED)

New test module; most test bodies are left as commented-out outlines, matching the commit title.

import pytest
import os
from unittest.mock import Mock, patch
from bs4 import BeautifulSoup
from scrapy.http import Response, Request, TextResponse
from backend.app.crawler import DomainCrawler, WebsiteSpider


@pytest.fixture
def sample_html():
    return """
    <html>
        <head><title>Test Page</title></head>
        <body>
            <main>
                <h1>Main Content</h1>
                <p>This is the main content of the page.</p>
            </main>
        </body>
    </html>
    """


@pytest.fixture
def crawler():
    return DomainCrawler("https://example.com", output_dir="test_output")


@pytest.fixture
def spider():
    return WebsiteSpider(start_url="https://example.com", output_dir="test_output")


# def test_crawler_initialization(crawler):
#     assert crawler.start_url == "https://example.com"
#     assert crawler.domain == "example.com"
#     assert crawler.output_dir == "test_output"
#     assert os.path.exists("test_output")

#     # Test Scrapy settings
#     assert crawler.settings.get('BOT_NAME') == 'website_crawler'
#     assert crawler.settings.get('ROBOTSTXT_OBEY') is True
#     assert crawler.settings.get('DOWNLOAD_DELAY') == 1


def create_response(url, body):
    request = Request(url=url)
    return TextResponse(
        url=url, body=body.encode("utf-8"), encoding="utf-8", request=request
    )


# def test_spider_parse_with_main_content(spider, sample_html):
#     url = "https://example.com/test"
#     response = create_response(url, sample_html)

#     # Process the page
#     list(spider.parse_item(response))

#     # Check if file was created
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1

#     # Read the saved file
#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
#         content = f.read()

#     # Verify content
#     assert "URL: https://example.com/test" in content
#     assert "Title: Test Page" in content
#     assert "Main Content" in content
#     assert "This is the main content of the page." in content


# def test_spider_parse_without_main_content(spider):
#     html_without_main = """
#     <html>
#         <head><title>No Main</title></head>
#         <body>
#             <div>Some body content</div>
#         </body>
#     </html>
#     """

#     url = "https://example.com/no-main"
#     response = create_response(url, html_without_main)

#     # Process the page
#     list(spider.parse_item(response))

#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1

#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
#         content = f.read()

#     assert "URL: https://example.com/no-main" in content
#     assert "Title: No Main" in content
#     assert "Some body content" in content


# def test_spider_parse_with_invalid_html(spider):
#     invalid_html = "<invalid><<html>"
#     url = "https://example.com/invalid"
#     response = create_response(url, invalid_html)

#     # Process should not raise an exception
#     list(spider.parse_item(response))

#     # Should still create a file
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1


# @patch('scrapy.crawler.CrawlerProcess')
# def test_start_crawling(mock_crawler_process_class, crawler):
#     # Configure the mock
#     mock_process = Mock()
#     mock_crawler_process_class.return_value = mock_process

#     # Start crawling
#     crawler.start()

#     # Verify process was created with correct settings
#     mock_crawler_process_class.assert_called_once_with(crawler.settings)

#     # Verify crawl method was called
#     mock_process.crawl.assert_called_once()
#     mock_process.start.assert_called_once()


@pytest.fixture(autouse=True)
def cleanup():
    # Setup - nothing needed
    yield
    # Cleanup after each test
    if os.path.exists("test_output"):
        for file in os.listdir("test_output"):
            os.remove(os.path.join("test_output", file))
        os.rmdir("test_output")
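One way the first outlined spider test could be activated, assuming the WebsiteSpider shown in this commit, is sketched below. The output directory is created up front because the spider itself does not create it, and parse_item() is called directly since the version shown in the diff does not appear to yield anything.

def test_spider_saves_main_content(spider, sample_html):
    # The spider does not create its output directory, so create it here
    os.makedirs(spider.output_dir, exist_ok=True)
    response = create_response("https://example.com/test", sample_html)

    spider.parse_item(response)

    files = os.listdir(spider.output_dir)
    assert len(files) == 1
    with open(os.path.join(spider.output_dir, files[0]), encoding="utf-8") as f:
        content = f.read()
    assert "URL: https://example.com/test" in content
    assert "Title: Test Page" in content
    assert "Main Content" in content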