Rsr2425 committed
Commit 26b3c2a · 1 Parent(s): e344fab

Fix broken test and code style

backend/app/crawler.py CHANGED
@@ -20,7 +20,7 @@ logger = logging.getLogger(__name__)
 class WebsiteSpider(CrawlSpider):
     """
     A Scrapy spider for crawling documentation websites and extracting content.
-
+
     This spider follows links within the allowed domain and extracts the main content
     from each page, saving it to a text file. It attempts to find content within <main>,
     <article> or content div tags, falling back to the full body if none are found.
@@ -36,7 +36,7 @@ class WebsiteSpider(CrawlSpider):
 
     def __init__(self, start_url, output_dir, *args, **kwargs):
         super().__init__(*args, **kwargs)
-
+
         self.start_urls = [start_url]
         self.allowed_domains = [urlparse(start_url).netloc]
         self.output_dir = output_dir
@@ -52,7 +52,6 @@ class WebsiteSpider(CrawlSpider):
         ),
     )
 
-
     def parse_item(self, response):
         """
         Parse a webpage and extract its content.
@@ -149,7 +148,7 @@ class DomainCrawler:
     def start(self):
         """
         Start the crawling process.
-
+
        This method initiates the crawler and blocks until crawling is complete.
         The extracted content will be saved to the configured output directory.
         """
backend/tests/test_api.py CHANGED
@@ -5,7 +5,13 @@ client = TestClient(app)
 
 
 def test_crawl_endpoint():
-    response = client.post("/api/ingest/", json={"url": "https://example.com"})
+    response = client.post(
+        "/api/ingest/",
+        json={
+            "url": "https://example.com",
+            "topic": "LangChain RAG Tutorial",
+        },
+    )
     assert response.status_code == 200
     assert response.json() == {"status": "RECEIVED"}
 
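This is the "broken test" fix from the commit message: the ingest payload now needs a topic field alongside url. A minimal sketch of the endpoint shape the test implies; the IngestRequest model name is a hypothetical stand-in inferred from the test, not taken from the backend code:

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class IngestRequest(BaseModel):  # hypothetical name; fields inferred from the test
    url: str
    topic: str  # newly required: posting only {"url": ...} would fail validation with a 422


@app.post("/api/ingest/")
def ingest(request: IngestRequest) -> dict:
    # The test only pins down the acknowledgement payload.
    return {"status": "RECEIVED"}
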
backend/tests/test_crawler.py CHANGED
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
 from scrapy.http import Response, Request
 from backend.app.crawler import WebsiteSpider, DomainCrawler
 
+
 @pytest.fixture
 def sample_html():
     return """
@@ -19,47 +20,51 @@ def sample_html():
     </html>
     """
 
+
 @pytest.fixture
 def output_dir(tmp_path):
     """Create a temporary directory for test outputs"""
     return str(tmp_path / "test_crawled_content")
 
+
 def test_website_spider_initialization():
     """Test WebsiteSpider initialization with correct parameters"""
     start_url = "https://example.com"
     output_dir = "test_output"
-
+
     spider = WebsiteSpider(start_url=start_url, output_dir=output_dir)
-
+
     assert spider.start_urls == [start_url]
     assert spider.allowed_domains == ["example.com"]
     assert spider.output_dir == output_dir
     assert len(spider.rules) == 1
 
+
 def test_parse_item_with_main_content(sample_html, output_dir):
     """Test parsing a page with main content section"""
     start_url = "https://example.com"
     spider = WebsiteSpider(start_url=start_url, output_dir=output_dir)
-
+
     # Create a mock response
     mock_response = Mock(spec=Response)
     mock_response.url = "https://example.com/test"
-    mock_response.body = sample_html.encode('utf-8')
-
+    mock_response.body = sample_html.encode("utf-8")
+
     # Process the mock response
     spider.parse_item(mock_response)
-
+
     # Check if file was created and contains correct content
     files = os.listdir(output_dir)
     assert len(files) == 1
-
-    with open(os.path.join(output_dir, files[0]), 'r', encoding='utf-8') as f:
+
+    with open(os.path.join(output_dir, files[0]), "r", encoding="utf-8") as f:
         content = f.read()
         assert "Test Page" in content
         assert "Main Content" in content
         assert "This is the main content" in content
         assert "URL: https://example.com/test" in content
 
+
 def test_parse_item_without_main_content(output_dir):
     """Test parsing a page without main content section"""
     html_without_main = """
@@ -70,64 +75,67 @@ def test_parse_item_without_main_content(output_dir):
     </body>
     </html>
     """
-
+
     start_url = "https://example.com"
     spider = WebsiteSpider(start_url=start_url, output_dir=output_dir)
-
+
     mock_response = Mock(spec=Response)
     mock_response.url = "https://example.com/no-main"
-    mock_response.body = html_without_main.encode('utf-8')
-
+    mock_response.body = html_without_main.encode("utf-8")
+
     spider.parse_item(mock_response)
-
+
     files = os.listdir(output_dir)
     assert len(files) == 1
-
-    with open(os.path.join(output_dir, files[0]), 'r', encoding='utf-8') as f:
+
+    with open(os.path.join(output_dir, files[0]), "r", encoding="utf-8") as f:
         content = f.read()
         assert "No Main Page" in content
         assert "Some body content" in content
 
+
 def test_domain_crawler_initialization():
     """Test DomainCrawler initialization"""
     start_url = "https://example.com"
     output_dir = "test_output"
-
+
     crawler = DomainCrawler(start_url=start_url, output_dir=output_dir)
-
+
     assert crawler.start_url == start_url
     assert crawler.domain == "example.com"
     assert crawler.output_dir == output_dir
-    assert crawler.settings.get('BOT_NAME') == "website_crawler"
-    assert crawler.settings.get('ROBOTSTXT_OBEY') is True
-    assert crawler.settings.get('CONCURRENT_REQUESTS') == 16
-    assert crawler.settings.get('DOWNLOAD_DELAY') == 1
+    assert crawler.settings.get("BOT_NAME") == "website_crawler"
+    assert crawler.settings.get("ROBOTSTXT_OBEY") is True
+    assert crawler.settings.get("CONCURRENT_REQUESTS") == 16
+    assert crawler.settings.get("DOWNLOAD_DELAY") == 1
+
 
-@patch('backend.app.crawler.CrawlerProcess')
+@patch("backend.app.crawler.CrawlerProcess")
 def test_domain_crawler_start(mock_crawler_process):
     """Test starting the domain crawler"""
     start_url = "https://example.com"
     output_dir = "test_output"
-
+
     crawler = DomainCrawler(start_url=start_url, output_dir=output_dir)
     crawler.start()
-
+
     # Verify that CrawlerProcess was instantiated and crawl was started
     mock_crawler_process.assert_called_once_with(crawler.settings)
     mock_crawler_process.return_value.crawl.assert_called_once()
     mock_crawler_process.return_value.start.assert_called_once()
 
+
 def test_output_directory_creation():
     """Test that output directory is created if it doesn't exist"""
     start_url = "https://example.com"
     output_dir = "test_output_dir"
-
+
     # Ensure directory doesn't exist
     if os.path.exists(output_dir):
         os.rmdir(output_dir)
-
+
     crawler = DomainCrawler(start_url=start_url, output_dir=output_dir)
     assert os.path.exists(output_dir)
-
+
     # Cleanup
-    os.rmdir(output_dir)
+    os.rmdir(output_dir)
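
The settings assertions in test_domain_crawler_initialization pin down how DomainCrawler configures Scrapy, and test_output_directory_creation shows the output directory is created eagerly in the constructor. A minimal sketch consistent with those assertions; the Settings construction and makedirs call are inferred from the tests, not copied from backend/app/crawler.py:

import os
from urllib.parse import urlparse

from scrapy.settings import Settings


class DomainCrawler:
    """Sketch covering only the behavior the tests assert."""

    def __init__(self, start_url, output_dir):
        self.start_url = start_url
        self.domain = urlparse(start_url).netloc  # "example.com" for the test URL
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)  # created at init, as the test expects
        self.settings = Settings({
            "BOT_NAME": "website_crawler",
            "ROBOTSTXT_OBEY": True,
            "CONCURRENT_REQUESTS": 16,
            "DOWNLOAD_DELAY": 1,
        })

With this shape, patching backend.app.crawler.CrawlerProcess in test_domain_crawler_start keeps crawler.start() from launching a real crawl.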