Added outlines of crawler code

Files changed:
- backend/app/crawler.py        +57 -36
- backend/app/main.py           +1 -1
- backend/tests/test_crawler.py +137 -0 (new file)
backend/app/crawler.py (CHANGED)

The spidy-based crawler is replaced with a Scrapy CrawlSpider plus a DomainCrawler wrapper. Removed lines that are truncated or hidden in the rendered diff are marked with "…".

@@ -1,44 +1,39 @@
-from spidy import crawler
 import os
 import re
 import logging
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
 from bs4 import BeautifulSoup
-from urllib.parse import …
+from urllib.parse import urlparse

 logger = logging.getLogger(__name__)


-class DomainCrawler:
-    …
-        self.start_url = start_url
-        self.domain = urlparse(start_url).netloc
-        self.output_dir = output_dir
+class WebsiteSpider(CrawlSpider):
+    name = "website_spider"

-        …
-        …
-        …
-        …
+    def __init__(self, start_url, output_dir, *args, **kwargs):
+        self.start_urls = [start_url]
+        self.allowed_domains = [urlparse(start_url).netloc]
+        self.output_dir = output_dir

-        # …
-        self.…
-            …
-            …
-            …
-            …
-            …
-            save_path=output_dir,
-            restrict_domain=True,
-            verbose=True,
+        # Define rules for link extraction
+        self.rules = (
+            Rule(
+                LinkExtractor(allow_domains=self.allowed_domains),
+                callback="parse_item",
+                follow=True,
+            ),
         )

-        …
-        self.crawler.page_handler = self.process_page
+        super().__init__(*args, **kwargs)

-    def …
-        """Custom page processor that extracts and saves content"""
+    def parse_item(self, response):
         try:
-            # Parse the HTML
-            soup = BeautifulSoup(…
+            # Parse the HTML with BeautifulSoup
+            soup = BeautifulSoup(response.body, "html.parser")

             # Extract the title
             title = soup.title.string if soup.title else "No Title"

@@ -47,7 +42,7 @@ class DomainCrawler:
             filename = re.sub(r"[^\w\-_]", "_", title) + ".txt"
             filepath = os.path.join(self.output_dir, filename)

-            # Extract main content
+            # Extract main content
             main_content = (
                 soup.find("main")
                 or soup.find("article")

@@ -65,28 +60,54 @@ class DomainCrawler:
                 else "No content"
             )
             logger.warning(
-                f"No main content found for {url}, falling back to body text"
+                f"No main content found for {response.url}, falling back to body text"
             )

             # Save the extracted content
             with open(filepath, "w", encoding="utf-8") as f:
-                f.write(f"URL: {url}\n")
+                f.write(f"URL: {response.url}\n")
                 f.write(f"Title: {title}\n\n")
                 f.write(text_content)

-            logger.info(f"Saved content from {url} to {filepath}")
+            logger.info(f"Saved content from {response.url} to {filepath}")

         except Exception as e:
-            logger.error(f"Error processing {url}: {e}", exc_info=True)
+            logger.error(f"Error processing {response.url}: {e}", exc_info=True)

-    …
+
+class DomainCrawler:
+    def __init__(self, start_url, output_dir="crawled_content"):
+        self.start_url = start_url
+        self.domain = urlparse(start_url).netloc
+        self.output_dir = output_dir
+
+        # Create output directory if it doesn't exist
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+            logger.info(f"Created output directory: {output_dir}")
+
+        # Configure Scrapy settings
+        self.settings = get_project_settings()
+        self.settings.update(
+            {
+                "BOT_NAME": "website_crawler",
+                "ROBOTSTXT_OBEY": True,
+                "CONCURRENT_REQUESTS": 16,
+                "DOWNLOAD_DELAY": 1,
+                "COOKIES_ENABLED": False,
+                "USER_AGENT": "Mozilla/5.0 (compatible; SimplifyCrawler/1.0)",
+            }
+        )

     def start(self):
         """Start the crawling process"""
         logger.info(f"Starting crawl from {self.start_url}")
-        self.crawler.crawl()

-        …
+        process = CrawlerProcess(self.settings)
+        process.crawl(
+            WebsiteSpider, start_url=self.start_url, output_dir=self.output_dir
+        )
+        process.start()
+
         logger.info("\nCrawl completed!")
-        logger.info(f"Pages crawled: {len(self.crawler.links_crawled)}")
         logger.info(f"Content saved to: {os.path.abspath(self.output_dir)}")
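For reference, a minimal way to exercise the new DomainCrawler outside the API might look like the sketch below. It is not part of this commit; the URL and output directory are placeholder values, and the import path assumes the repository layout shown above.

import logging

from backend.app.crawler import DomainCrawler

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Hypothetical example values; replace with a real site and directory
    crawler = DomainCrawler("https://example.com", output_dir="crawled_content")
    crawler.start()  # blocks until the Scrapy crawl completes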
backend/app/main.py (CHANGED)

The crawler import is switched to an absolute package path.

@@ -9,7 +9,7 @@ from typing import Dict, List
 import asyncio
 import logging
 import os
-from crawler import DomainCrawler
+from backend.app.crawler import DomainCrawler

 app = FastAPI()

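Since CrawlerProcess.start() blocks and Twisted's reactor cannot be restarted within the same process, main.py would likely have to run each crawl outside the server process. A minimal sketch of one way to do that, assuming a hypothetical /crawl endpoint (none of this is in the commit):

from multiprocessing import Process

from fastapi import FastAPI

from backend.app.crawler import DomainCrawler

app = FastAPI()


def run_crawl(start_url: str, output_dir: str) -> None:
    # Runs in a child process so the blocking Scrapy crawl does not stall
    # the FastAPI event loop and gets a fresh Twisted reactor each time.
    DomainCrawler(start_url, output_dir=output_dir).start()


@app.post("/crawl")
def start_crawl(start_url: str):
    # Fire-and-forget: the crawl continues in its own process
    Process(target=run_crawl, args=(start_url, "crawled_content"), daemon=True).start()
    return {"status": "started", "start_url": start_url}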
backend/tests/test_crawler.py (ADDED)

New test module; most test bodies are left as commented-out outlines, matching the commit title.

import pytest
import os
from unittest.mock import Mock, patch
from bs4 import BeautifulSoup
from scrapy.http import Response, Request, TextResponse
from backend.app.crawler import DomainCrawler, WebsiteSpider


@pytest.fixture
def sample_html():
    return """
    <html>
        <head><title>Test Page</title></head>
        <body>
            <main>
                <h1>Main Content</h1>
                <p>This is the main content of the page.</p>
            </main>
        </body>
    </html>
    """


@pytest.fixture
def crawler():
    return DomainCrawler("https://example.com", output_dir="test_output")


@pytest.fixture
def spider():
    return WebsiteSpider(start_url="https://example.com", output_dir="test_output")


# def test_crawler_initialization(crawler):
#     assert crawler.start_url == "https://example.com"
#     assert crawler.domain == "example.com"
#     assert crawler.output_dir == "test_output"
#     assert os.path.exists("test_output")

#     # Test Scrapy settings
#     assert crawler.settings.get('BOT_NAME') == 'website_crawler'
#     assert crawler.settings.get('ROBOTSTXT_OBEY') is True
#     assert crawler.settings.get('DOWNLOAD_DELAY') == 1


def create_response(url, body):
    request = Request(url=url)
    return TextResponse(
        url=url, body=body.encode("utf-8"), encoding="utf-8", request=request
    )


# def test_spider_parse_with_main_content(spider, sample_html):
#     url = "https://example.com/test"
#     response = create_response(url, sample_html)

#     # Process the page
#     list(spider.parse_item(response))

#     # Check if file was created
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1

#     # Read the saved file
#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
#         content = f.read()

#     # Verify content
#     assert "URL: https://example.com/test" in content
#     assert "Title: Test Page" in content
#     assert "Main Content" in content
#     assert "This is the main content of the page." in content


# def test_spider_parse_without_main_content(spider):
#     html_without_main = """
#     <html>
#         <head><title>No Main</title></head>
#         <body>
#             <div>Some body content</div>
#         </body>
#     </html>
#     """

#     url = "https://example.com/no-main"
#     response = create_response(url, html_without_main)

#     # Process the page
#     list(spider.parse_item(response))

#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1

#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
#         content = f.read()

#     assert "URL: https://example.com/no-main" in content
#     assert "Title: No Main" in content
#     assert "Some body content" in content


# def test_spider_parse_with_invalid_html(spider):
#     invalid_html = "<invalid><<html>"
#     url = "https://example.com/invalid"
#     response = create_response(url, invalid_html)

#     # Process should not raise an exception
#     list(spider.parse_item(response))

#     # Should still create a file
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1


# @patch('scrapy.crawler.CrawlerProcess')
# def test_start_crawling(mock_crawler_process_class, crawler):
#     # Configure the mock
#     mock_process = Mock()
#     mock_crawler_process_class.return_value = mock_process

#     # Start crawling
#     crawler.start()

#     # Verify process was created with correct settings
#     mock_crawler_process_class.assert_called_once_with(crawler.settings)

#     # Verify crawl method was called
#     mock_process.crawl.assert_called_once()
#     mock_process.start.assert_called_once()


@pytest.fixture(autouse=True)
def cleanup():
    # Setup - nothing needed
    yield
    # Cleanup after each test
    if os.path.exists("test_output"):
        for file in os.listdir("test_output"):
            os.remove(os.path.join("test_output", file))
        os.rmdir("test_output")
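One way the first outlined spider test could be activated, assuming the WebsiteSpider shown in this commit, is sketched below. The output directory is created up front because the spider itself does not create it, and parse_item() is called directly since the version shown in the diff does not appear to yield anything.

def test_spider_saves_main_content(spider, sample_html):
    # The spider does not create its output directory, so create it here
    os.makedirs(spider.output_dir, exist_ok=True)
    response = create_response("https://example.com/test", sample_html)

    spider.parse_item(response)

    files = os.listdir(spider.output_dir)
    assert len(files) == 1
    with open(os.path.join(spider.output_dir, files[0]), encoding="utf-8") as f:
        content = f.read()
    assert "URL: https://example.com/test" in content
    assert "Title: Test Page" in content
    assert "Main Content" in content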