Rsr2425 committed
Commit 323db03 · 1 Parent(s): 45884d3

Added outlines of crawler code
backend/app/crawler.py CHANGED
@@ -1,44 +1,39 @@
-from spidy import crawler
 import os
 import re
 import logging
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
 from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urlparse

 logger = logging.getLogger(__name__)


-class DomainCrawler:
-    def __init__(self, start_url, output_dir="crawled_content"):
-        self.start_url = start_url
-        self.domain = urlparse(start_url).netloc
-        self.output_dir = output_dir
+class WebsiteSpider(CrawlSpider):
+    name = "website_spider"

-        # Create output directory if it doesn't exist
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-            logger.info(f"Created output directory: {output_dir}")
+    def __init__(self, start_url, output_dir, *args, **kwargs):
+        self.start_urls = [start_url]
+        self.allowed_domains = [urlparse(start_url).netloc]
+        self.output_dir = output_dir

-        # Initialize the crawler
-        self.crawler = crawler.Crawler(
-            start_url=start_url,
-            max_pages=1000,
-            timeout=10,
-            delay=0.5,
-            save_pages=True,
-            save_path=output_dir,
-            restrict_domain=True,
-            verbose=True,
+        # Define rules for link extraction
+        self.rules = (
+            Rule(
+                LinkExtractor(allow_domains=self.allowed_domains),
+                callback="parse_item",
+                follow=True,
+            ),
         )

-        # Set custom handlers
-        self.crawler.page_handler = self.process_page
+        super().__init__(*args, **kwargs)

-    def process_page(self, url, content):
-        """Custom page processor that extracts and saves content"""
+    def parse_item(self, response):
         try:
-            # Parse the HTML
-            soup = BeautifulSoup(content, "html.parser")
+            # Parse the HTML with BeautifulSoup
+            soup = BeautifulSoup(response.body, "html.parser")

             # Extract the title
             title = soup.title.string if soup.title else "No Title"
@@ -47,7 +42,7 @@ class DomainCrawler:
             filename = re.sub(r"[^\w\-_]", "_", title) + ".txt"
             filepath = os.path.join(self.output_dir, filename)

-            # Extract main content (this is just an example - adjust for your site)
+            # Extract main content
             main_content = (
                 soup.find("main")
                 or soup.find("article")
@@ -65,28 +60,54 @@ class DomainCrawler:
                 else "No content"
             )
             logger.warning(
-                f"No main content found for {url}, falling back to body text"
+                f"No main content found for {response.url}, falling back to body text"
             )

             # Save the extracted content
             with open(filepath, "w", encoding="utf-8") as f:
-                f.write(f"URL: {url}\n")
+                f.write(f"URL: {response.url}\n")
                 f.write(f"Title: {title}\n\n")
                 f.write(text_content)

-            logger.info(f"Saved content from {url} to {filepath}")
+            logger.info(f"Saved content from {response.url} to {filepath}")

         except Exception as e:
-            logger.error(f"Error processing {url}: {e}", exc_info=True)
+            logger.error(f"Error processing {response.url}: {e}", exc_info=True)

-        return content  # Return the original content for the crawler to continue
+
+class DomainCrawler:
+    def __init__(self, start_url, output_dir="crawled_content"):
+        self.start_url = start_url
+        self.domain = urlparse(start_url).netloc
+        self.output_dir = output_dir
+
+        # Create output directory if it doesn't exist
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+            logger.info(f"Created output directory: {output_dir}")
+
+        # Configure Scrapy settings
+        self.settings = get_project_settings()
+        self.settings.update(
+            {
+                "BOT_NAME": "website_crawler",
+                "ROBOTSTXT_OBEY": True,
+                "CONCURRENT_REQUESTS": 16,
+                "DOWNLOAD_DELAY": 1,
+                "COOKIES_ENABLED": False,
+                "USER_AGENT": "Mozilla/5.0 (compatible; SimplifyCrawler/1.0)",
+            }
+        )

     def start(self):
         """Start the crawling process"""
         logger.info(f"Starting crawl from {self.start_url}")
-        self.crawler.crawl()

-        # Print summary
+        process = CrawlerProcess(self.settings)
+        process.crawl(
+            WebsiteSpider, start_url=self.start_url, output_dir=self.output_dir
+        )
+        process.start()
+
         logger.info("\nCrawl completed!")
-        logger.info(f"Pages crawled: {len(self.crawler.links_crawled)}")
         logger.info(f"Content saved to: {os.path.abspath(self.output_dir)}")
backend/app/main.py CHANGED
@@ -9,7 +9,7 @@ from typing import Dict, List
 import asyncio
 import logging
 import os
-from crawler import DomainCrawler
+from backend.app.crawler import DomainCrawler

 app = FastAPI()

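Note on the import change above: the bare "from crawler import DomainCrawler" only resolves when backend/app itself is on sys.path, while the absolute "from backend.app.crawler import DomainCrawler" assumes the server is launched from the repository root with backend importable as a package (for example via "uvicorn backend.app.main:app"). That launch command is an assumption rather than something shown in this diff, but the same package-style path is what the new tests below import.
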
backend/tests/test_crawler.py ADDED
@@ -0,0 +1,137 @@
+import pytest
+import os
+from unittest.mock import Mock, patch
+from bs4 import BeautifulSoup
+from scrapy.http import Response, Request, TextResponse
+from backend.app.crawler import DomainCrawler, WebsiteSpider
+
+
+@pytest.fixture
+def sample_html():
+    return """
+    <html>
+        <head><title>Test Page</title></head>
+        <body>
+            <main>
+                <h1>Main Content</h1>
+                <p>This is the main content of the page.</p>
+            </main>
+        </body>
+    </html>
+    """
+
+
+@pytest.fixture
+def crawler():
+    return DomainCrawler("https://example.com", output_dir="test_output")
+
+
+@pytest.fixture
+def spider():
+    return WebsiteSpider(start_url="https://example.com", output_dir="test_output")
+
+
+# def test_crawler_initialization(crawler):
+#     assert crawler.start_url == "https://example.com"
+#     assert crawler.domain == "example.com"
+#     assert crawler.output_dir == "test_output"
+#     assert os.path.exists("test_output")
+
+#     # Test Scrapy settings
+#     assert crawler.settings.get('BOT_NAME') == 'website_crawler'
+#     assert crawler.settings.get('ROBOTSTXT_OBEY') is True
+#     assert crawler.settings.get('DOWNLOAD_DELAY') == 1
+
+
+def create_response(url, body):
+    request = Request(url=url)
+    return TextResponse(
+        url=url, body=body.encode("utf-8"), encoding="utf-8", request=request
+    )
+
+
+# def test_spider_parse_with_main_content(spider, sample_html):
+#     url = "https://example.com/test"
+#     response = create_response(url, sample_html)
+
+#     # Process the page
+#     list(spider.parse_item(response))
+
+#     # Check if file was created
+#     files = os.listdir(spider.output_dir)
+#     assert len(files) == 1
+
+#     # Read the saved file
+#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
+#         content = f.read()
+
+#     # Verify content
+#     assert "URL: https://example.com/test" in content
+#     assert "Title: Test Page" in content
+#     assert "Main Content" in content
+#     assert "This is the main content of the page." in content
+
+# def test_spider_parse_without_main_content(spider):
+#     html_without_main = """
+#     <html>
+#         <head><title>No Main</title></head>
+#         <body>
+#             <div>Some body content</div>
+#         </body>
+#     </html>
+#     """
+
+#     url = "https://example.com/no-main"
+#     response = create_response(url, html_without_main)
+
+#     # Process the page
+#     list(spider.parse_item(response))
+
+#     files = os.listdir(spider.output_dir)
+#     assert len(files) == 1
+
+#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
+#         content = f.read()
+
+#     assert "URL: https://example.com/no-main" in content
+#     assert "Title: No Main" in content
+#     assert "Some body content" in content
+
+# def test_spider_parse_with_invalid_html(spider):
+#     invalid_html = "<invalid><<html>"
+#     url = "https://example.com/invalid"
+#     response = create_response(url, invalid_html)
+
+#     # Process should not raise an exception
+#     list(spider.parse_item(response))
+
+#     # Should still create a file
+#     files = os.listdir(spider.output_dir)
+#     assert len(files) == 1
+
+# @patch('scrapy.crawler.CrawlerProcess')
+# def test_start_crawling(mock_crawler_process_class, crawler):
+#     # Configure the mock
+#     mock_process = Mock()
+#     mock_crawler_process_class.return_value = mock_process
+
+#     # Start crawling
+#     crawler.start()
+
+#     # Verify process was created with correct settings
+#     mock_crawler_process_class.assert_called_once_with(crawler.settings)
+
+#     # Verify crawl method was called
+#     mock_process.crawl.assert_called_once()
+#     mock_process.start.assert_called_once()
+
+
+@pytest.fixture(autouse=True)
+def cleanup():
+    # Setup - nothing needed
+    yield
+    # Cleanup after each test
+    if os.path.exists("test_output"):
+        for file in os.listdir("test_output"):
+            os.remove(os.path.join("test_output", file))
+        os.rmdir("test_output")
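
All of the substantive tests in this file are still commented out; only the fixtures and the create_response helper are active, so running the suite (for example, pytest backend/tests/test_crawler.py) currently exercises nothing in the spider. Below is a hedged sketch of driving WebsiteSpider.parse_item directly, mirroring the commented-out tests above; the HTML, URL, and output directory are illustrative, and the directory is created up front because WebsiteSpider never creates it itself (only DomainCrawler does):

    # Standalone sketch, not part of the commit; mirrors create_response() above.
    import os

    from scrapy.http import Request, TextResponse
    from backend.app.crawler import WebsiteSpider

    os.makedirs("test_output", exist_ok=True)  # parse_item writes files but never mkdirs
    spider = WebsiteSpider(start_url="https://example.com", output_dir="test_output")

    html = "<html><head><title>Test Page</title></head><body><main><p>Hi</p></main></body></html>"
    url = "https://example.com/test"
    response = TextResponse(
        url=url, body=html.encode("utf-8"), encoding="utf-8", request=Request(url=url)
    )

    # Based on the filename logic in parse_item, this should write
    # test_output/Test_Page.txt containing the URL, title, and <main> text.
    spider.parse_item(response)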