Rsr2425 committed
Commit e344fab · 1 Parent(s): 9fc8e5c

Fixed crawler code and added tests
backend/app/crawler.py CHANGED
@@ -8,18 +8,42 @@ from scrapy.utils.project import get_project_settings
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse

+"""
+A web crawler module for extracting content from documentation websites.
+This module provides classes for crawling a domain and extracting main content
+from web pages, saving the results as text files.
+"""
+
 logger = logging.getLogger(__name__)


 class WebsiteSpider(CrawlSpider):
+    """
+    A Scrapy spider for crawling documentation websites and extracting content.
+
+    This spider follows links within the allowed domain and extracts the main content
+    from each page, saving it to a text file. It attempts to find content within <main>,
+    <article> or content div tags, falling back to the full body if none are found.
+
+    Args:
+        start_url (str): The URL where crawling should begin
+        output_dir (str): Directory where extracted content should be saved
+        *args: Additional positional arguments passed to CrawlSpider
+        **kwargs: Additional keyword arguments passed to CrawlSpider
+    """
+
     name = "website_spider"

     def __init__(self, start_url, output_dir, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
         self.start_urls = [start_url]
         self.allowed_domains = [urlparse(start_url).netloc]
         self.output_dir = output_dir

-        # Define rules for link extraction
+        os.makedirs(output_dir, exist_ok=True)
+        logger.info(f"Created output directory: {output_dir}")
+
         self.rules = (
             Rule(
                 LinkExtractor(allow_domains=self.allowed_domains),
@@ -28,9 +52,17 @@ class WebsiteSpider(CrawlSpider):
             ),
         )

-        super().__init__(*args, **kwargs)

     def parse_item(self, response):
+        """
+        Parse a webpage and extract its content.
+
+        Args:
+            response: The Scrapy response object containing the webpage
+
+        The extracted content is saved to a text file in the output directory,
+        including the page URL, title and main content.
+        """
         try:
             # Parse the HTML with BeautifulSoup
             soup = BeautifulSoup(response.body, "html.parser")
@@ -76,15 +108,30 @@ class WebsiteSpider(CrawlSpider):


 class DomainCrawler:
+    """
+    High-level crawler class for extracting content from a documentation website.
+
+    This class provides a simple interface for crawling a website and extracting its
+    content. It configures and runs a Scrapy crawler with sensible defaults for
+    crawling documentation sites.
+
+    Example:
+        crawler = DomainCrawler("https://docs.example.com")
+        crawler.start()  # Crawls the site and saves content to ./crawled_content/
+
+    Args:
+        start_url (str): The URL where crawling should begin
+        output_dir (str, optional): Directory where extracted content should be saved.
+            Defaults to "crawled_content"
+    """
+
     def __init__(self, start_url, output_dir="crawled_content"):
         self.start_url = start_url
         self.domain = urlparse(start_url).netloc
         self.output_dir = output_dir

-        # Create output directory if it doesn't exist
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-            logger.info(f"Created output directory: {output_dir}")
+        os.makedirs(output_dir, exist_ok=True)
+        logger.info(f"Created output directory: {output_dir}")

         # Configure Scrapy settings
         self.settings = get_project_settings()
@@ -100,7 +147,12 @@ class DomainCrawler:
         )

     def start(self):
-        """Start the crawling process"""
+        """
+        Start the crawling process.
+
+        This method initiates the crawler and blocks until crawling is complete.
+        The extracted content will be saved to the configured output directory.
+        """
         logger.info(f"Starting crawl from {self.start_url}")

         process = CrawlerProcess(self.settings)
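
For reference, the reworked DomainCrawler is meant to be driven directly from Python; a minimal usage sketch based on the docstring added above (the docs URL is a placeholder):

from backend.app.crawler import DomainCrawler

# Crawl a documentation site; extracted pages are written as text files
# to ./crawled_content/ (the default output_dir), which is created if missing.
crawler = DomainCrawler("https://docs.example.com")
crawler.start()  # blocks until the crawl completes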
backend/app/main.py CHANGED
@@ -11,6 +11,7 @@ import logging
 import os
 from backend.app.crawler import DomainCrawler
 from backend.app.vectorstore import get_all_unique_source_of_docs_in_collection_DUMB
+from enum import Enum

 app = FastAPI()

@@ -23,6 +24,20 @@ app.add_middleware(
 )


+class IngestStatus(Enum):
+    RECEIVED = "RECEIVED"
+    FAILURE = "FAILURE"
+
+
+class IngestRequest(BaseModel):
+    topic: str
+    url: str
+
+
+class IngestResponse(BaseModel):
+    status: IngestStatus
+
+
 class UrlInput(BaseModel):
     url: str

@@ -46,10 +61,11 @@ class TopicsResponse(BaseModel):
     sources: List[str]


-@app.post("/api/ingest/")
-async def ingest_documentation(input_data: UrlInput):
+# TODO maybe call this /api/scan/ just to be consistent and match FE?
+@app.post("/api/ingest/", response_model=IngestResponse)
+async def ingest_documentation(input_data: IngestRequest):
     print(f"Received url {input_data.url}")
-    return {"status": "received"}
+    return IngestResponse(status=IngestStatus.RECEIVED)


 @app.post("/api/problems/")
backend/tests/test_api.py CHANGED
@@ -1,6 +1,5 @@
 from fastapi.testclient import TestClient
 from backend.app.main import app
-import pytest

 client = TestClient(app)

@@ -8,7 +7,7 @@ client = TestClient(app)
 def test_crawl_endpoint():
     response = client.post("/api/ingest/", json={"url": "https://example.com"})
     assert response.status_code == 200
-    assert response.json() == {"status": "received"}
+    assert response.json() == {"status": "RECEIVED"}


 def test_problems_endpoint():
backend/tests/test_crawler.py CHANGED
@@ -1,10 +1,9 @@
-import pytest
 import os
+import pytest
 from unittest.mock import Mock, patch
 from bs4 import BeautifulSoup
-from scrapy.http import Response, Request, TextResponse
-from backend.app.crawler import DomainCrawler, WebsiteSpider
-
+from scrapy.http import Response, Request
+from backend.app.crawler import WebsiteSpider, DomainCrawler

 @pytest.fixture
 def sample_html():
@@ -14,124 +13,121 @@ def sample_html():
     <body>
         <main>
             <h1>Main Content</h1>
-            <p>This is the main content of the page.</p>
+            <p>This is the main content.</p>
         </main>
     </body>
     </html>
     """

-
-@pytest.fixture
-def crawler():
-    return DomainCrawler("https://example.com", output_dir="test_output")
-
-
 @pytest.fixture
-def spider():
-    return WebsiteSpider(start_url="https://example.com", output_dir="test_output")
-
-
-# def test_crawler_initialization(crawler):
-#     assert crawler.start_url == "https://example.com"
-#     assert crawler.domain == "example.com"
-#     assert crawler.output_dir == "test_output"
-#     assert os.path.exists("test_output")
-
-#     # Test Scrapy settings
-#     assert crawler.settings.get('BOT_NAME') == 'website_crawler'
-#     assert crawler.settings.get('ROBOTSTXT_OBEY') is True
-#     assert crawler.settings.get('DOWNLOAD_DELAY') == 1
-
-
-def create_response(url, body):
-    request = Request(url=url)
-    return TextResponse(
-        url=url, body=body.encode("utf-8"), encoding="utf-8", request=request
-    )
-
-
-# def test_spider_parse_with_main_content(spider, sample_html):
-#     url = "https://example.com/test"
-#     response = create_response(url, sample_html)
-
-#     # Process the page
-#     list(spider.parse_item(response))
-
-#     # Check if file was created
-#     files = os.listdir(spider.output_dir)
-#     assert len(files) == 1
-
-#     # Read the saved file
-#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
-#         content = f.read()
-
-#     # Verify content
-#     assert "URL: https://example.com/test" in content
-#     assert "Title: Test Page" in content
-#     assert "Main Content" in content
-#     assert "This is the main content of the page." in content
-
-# def test_spider_parse_without_main_content(spider):
-#     html_without_main = """
-#     <html>
-#         <head><title>No Main</title></head>
-#         <body>
-#             <div>Some body content</div>
-#         </body>
-#     </html>
-#     """
-
-#     url = "https://example.com/no-main"
-#     response = create_response(url, html_without_main)
-
-#     # Process the page
-#     list(spider.parse_item(response))
-
-#     files = os.listdir(spider.output_dir)
-#     assert len(files) == 1
-
-#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
-#         content = f.read()
-
-#     assert "URL: https://example.com/no-main" in content
-#     assert "Title: No Main" in content
-#     assert "Some body content" in content
-
-# def test_spider_parse_with_invalid_html(spider):
-#     invalid_html = "<invalid><<html>"
-#     url = "https://example.com/invalid"
-#     response = create_response(url, invalid_html)
-
-#     # Process should not raise an exception
-#     list(spider.parse_item(response))
-
-#     # Should still create a file
-#     files = os.listdir(spider.output_dir)
-#     assert len(files) == 1
-
-# @patch('scrapy.crawler.CrawlerProcess')
-# def test_start_crawling(mock_crawler_process_class, crawler):
-#     # Configure the mock
-#     mock_process = Mock()
-#     mock_crawler_process_class.return_value = mock_process
-
-#     # Start crawling
-#     crawler.start()
-
-#     # Verify process was created with correct settings
-#     mock_crawler_process_class.assert_called_once_with(crawler.settings)
-
-#     # Verify crawl method was called
-#     mock_process.crawl.assert_called_once()
-#     mock_process.start.assert_called_once()
-
-
-@pytest.fixture(autouse=True)
-def cleanup():
-    # Setup - nothing needed
-    yield
-    # Cleanup after each test
-    if os.path.exists("test_output"):
-        for file in os.listdir("test_output"):
-            os.remove(os.path.join("test_output", file))
-        os.rmdir("test_output")
+def output_dir(tmp_path):
+    """Create a temporary directory for test outputs"""
+    return str(tmp_path / "test_crawled_content")
+
+def test_website_spider_initialization():
+    """Test WebsiteSpider initialization with correct parameters"""
+    start_url = "https://example.com"
+    output_dir = "test_output"
+
+    spider = WebsiteSpider(start_url=start_url, output_dir=output_dir)
+
+    assert spider.start_urls == [start_url]
+    assert spider.allowed_domains == ["example.com"]
+    assert spider.output_dir == output_dir
+    assert len(spider.rules) == 1
+
+def test_parse_item_with_main_content(sample_html, output_dir):
+    """Test parsing a page with main content section"""
+    start_url = "https://example.com"
+    spider = WebsiteSpider(start_url=start_url, output_dir=output_dir)
+
+    # Create a mock response
+    mock_response = Mock(spec=Response)
+    mock_response.url = "https://example.com/test"
+    mock_response.body = sample_html.encode('utf-8')
+
+    # Process the mock response
+    spider.parse_item(mock_response)
+
+    # Check if file was created and contains correct content
+    files = os.listdir(output_dir)
+    assert len(files) == 1
+
+    with open(os.path.join(output_dir, files[0]), 'r', encoding='utf-8') as f:
+        content = f.read()
+        assert "Test Page" in content
+        assert "Main Content" in content
+        assert "This is the main content" in content
+        assert "URL: https://example.com/test" in content
+
+def test_parse_item_without_main_content(output_dir):
+    """Test parsing a page without main content section"""
+    html_without_main = """
+    <html>
+        <head><title>No Main Page</title></head>
+        <body>
+            <div>Some body content</div>
+        </body>
+    </html>
+    """
+
+    start_url = "https://example.com"
+    spider = WebsiteSpider(start_url=start_url, output_dir=output_dir)
+
+    mock_response = Mock(spec=Response)
+    mock_response.url = "https://example.com/no-main"
+    mock_response.body = html_without_main.encode('utf-8')
+
+    spider.parse_item(mock_response)
+
+    files = os.listdir(output_dir)
+    assert len(files) == 1
+
+    with open(os.path.join(output_dir, files[0]), 'r', encoding='utf-8') as f:
+        content = f.read()
+        assert "No Main Page" in content
+        assert "Some body content" in content
+
+def test_domain_crawler_initialization():
+    """Test DomainCrawler initialization"""
+    start_url = "https://example.com"
+    output_dir = "test_output"
+
+    crawler = DomainCrawler(start_url=start_url, output_dir=output_dir)
+
+    assert crawler.start_url == start_url
+    assert crawler.domain == "example.com"
+    assert crawler.output_dir == output_dir
+    assert crawler.settings.get('BOT_NAME') == "website_crawler"
+    assert crawler.settings.get('ROBOTSTXT_OBEY') is True
+    assert crawler.settings.get('CONCURRENT_REQUESTS') == 16
+    assert crawler.settings.get('DOWNLOAD_DELAY') == 1
+
+@patch('backend.app.crawler.CrawlerProcess')
+def test_domain_crawler_start(mock_crawler_process):
+    """Test starting the domain crawler"""
+    start_url = "https://example.com"
+    output_dir = "test_output"
+
+    crawler = DomainCrawler(start_url=start_url, output_dir=output_dir)
+    crawler.start()
+
+    # Verify that CrawlerProcess was instantiated and crawl was started
+    mock_crawler_process.assert_called_once_with(crawler.settings)
+    mock_crawler_process.return_value.crawl.assert_called_once()
+    mock_crawler_process.return_value.start.assert_called_once()
+
+def test_output_directory_creation():
+    """Test that output directory is created if it doesn't exist"""
+    start_url = "https://example.com"
+    output_dir = "test_output_dir"
+
+    # Ensure directory doesn't exist
+    if os.path.exists(output_dir):
+        os.rmdir(output_dir)
+
+    crawler = DomainCrawler(start_url=start_url, output_dir=output_dir)
+    assert os.path.exists(output_dir)
+
+    # Cleanup
+    os.rmdir(output_dir)
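
If a later revision prefers feeding parse_item a real Scrapy response instead of a Mock, the small helper removed by this commit can be brought back largely as-is; a sketch using scrapy.http.TextResponse:

from scrapy.http import Request, TextResponse

def create_response(url: str, body: str) -> TextResponse:
    """Build a real TextResponse suitable for exercising WebsiteSpider.parse_item in tests."""
    request = Request(url=url)
    return TextResponse(
        url=url, body=body.encode("utf-8"), encoding="utf-8", request=request
    )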