import time
import random
from typing import Set, Dict, List, Tuple
from urllib.parse import urljoin, urlparse

from phi.document.base import Document
from phi.document.reader.base import Reader
from phi.utils.log import logger

import httpx

try:
    from bs4 import BeautifulSoup  # noqa: F401
except ImportError:
    raise ImportError("The `bs4` package is not installed. Please install it via `pip install beautifulsoup4`.")

class WebsiteReader(Reader):
    """Reader for Websites"""

    max_depth: int = 3
    max_links: int = 10

    _visited: Set[str] = set()
    _urls_to_crawl: List[Tuple[str, int]] = []

    def delay(self, min_seconds=1, max_seconds=3):
        """
        Introduce a random delay.

        :param min_seconds: Minimum number of seconds to delay. Default is 1.
        :param max_seconds: Maximum number of seconds to delay. Default is 3.
        """
        sleep_time = random.uniform(min_seconds, max_seconds)
        time.sleep(sleep_time)

    def _get_primary_domain(self, url: str) -> str:
        """
        Extract primary domain from the given URL.

        :param url: The URL to extract the primary domain from.
        :return: The primary domain.
        """
        domain_parts = urlparse(url).netloc.split(".")
        # Return primary domain (excluding subdomains)
        return ".".join(domain_parts[-2:])

    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """
        Extracts the main content from a BeautifulSoup object.

        :param soup: The BeautifulSoup object to extract the main content from.
        :return: The main content.
        """
        # Try to find main content by specific tags or class names
        for tag in ["article", "main"]:
            element = soup.find(tag)
            if element:
                return element.get_text(strip=True, separator=" ")

        for class_name in ["content", "main-content", "post-content"]:
            element = soup.find(class_=class_name)
            if element:
                return element.get_text(strip=True, separator=" ")

        return ""

    def crawl(self, url: str, starting_depth: int = 1) -> Dict[str, str]:
        """
        Crawls a website and returns a dictionary of URLs and their corresponding content.

        Parameters:
        - url (str): The starting URL to begin the crawl.
        - starting_depth (int, optional): The starting depth level for the crawl. Defaults to 1.

        Returns:
        - Dict[str, str]: A dictionary where each key is a URL and the corresponding value is the main
                          content extracted from that URL.

        Note:
        The function focuses on extracting the main content by prioritizing content inside common HTML tags
        like `<article>` and `<main>`, and elements with class names such as "content", "main-content", etc.
        The crawler also respects the `max_depth` and `max_links` attributes of the WebsiteReader class,
        ensuring it does not crawl deeper or collect more pages than specified.
        """
        num_links = 0
        crawler_result: Dict[str, str] = {}
        primary_domain = self._get_primary_domain(url)

        # Add the starting URL with its depth to the crawl queue
        self._urls_to_crawl.append((url, starting_depth))
        while self._urls_to_crawl:
            # Unpack the next URL and its depth from the crawl queue
            current_url, current_depth = self._urls_to_crawl.pop(0)

            # Skip if:
            # - the URL has already been visited
            # - the URL does not belong to the primary domain
            # - the current depth exceeds max_depth
            # - the number of collected pages has reached max_links
            if (
                current_url in self._visited
                or not urlparse(current_url).netloc.endswith(primary_domain)
                or current_depth > self.max_depth
                or num_links >= self.max_links
            ):
                continue

            self._visited.add(current_url)
            self.delay()

            try:
                logger.debug(f"Crawling: {current_url}")
                response = httpx.get(current_url, timeout=10)
                soup = BeautifulSoup(response.content, "html.parser")

                # Extract main content
                main_content = self._extract_main_content(soup)
                if main_content:
                    crawler_result[current_url] = main_content
                    num_links += 1

                # Queue newly found URLs on the same domain, with incremented depth
                for link in soup.find_all("a", href=True):
                    full_url = urljoin(current_url, link["href"])
                    parsed_url = urlparse(full_url)
                    if parsed_url.netloc.endswith(primary_domain) and not any(
                        parsed_url.path.endswith(ext) for ext in [".pdf", ".jpg", ".png"]
                    ):
                        if full_url not in self._visited and (full_url, current_depth + 1) not in self._urls_to_crawl:
                            self._urls_to_crawl.append((full_url, current_depth + 1))
            except Exception as e:
                logger.debug(f"Failed to crawl: {current_url}: {e}")

        return crawler_result

    def read(self, url: str) -> List[Document]:
        """
        Reads a website and returns a list of documents.

        This function first converts the website into a dictionary of URLs and their corresponding content.
        It then iterates through that dictionary and returns the content as documents, chunked if chunking is enabled.

        :param url: The URL of the website to read.
        :return: A list of documents.
        """
        logger.debug(f"Reading: {url}")
        crawler_result = self.crawl(url)
        documents = []
        for crawled_url, crawled_content in crawler_result.items():
            if self.chunk:
                documents.extend(
                    self.chunk_document(
                        Document(
                            name=url, id=str(crawled_url), meta_data={"url": str(crawled_url)}, content=crawled_content
                        )
                    )
                )
            else:
                documents.append(
                    Document(
                        name=url,
                        id=str(crawled_url),
                        meta_data={"url": str(crawled_url)},
                        content=crawled_content,
                    )
                )
        return documents
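

# Usage sketch (an illustrative addition, not part of the original module): shows how this
# reader might be invoked. It assumes the `phi` package is installed and the target site is
# reachable; the URL below is a placeholder and the `max_depth` / `max_links` values are
# chosen only for the example.
if __name__ == "__main__":
    reader = WebsiteReader(max_depth=2, max_links=5)
    docs = reader.read("https://example.com")
    for doc in docs:
        # Each Document carries the crawled URL in its meta_data and the extracted text in content
        print(doc.meta_data["url"], len(doc.content or ""))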