import time
import random
from typing import Set, Dict, List, Tuple
from urllib.parse import urljoin, urlparse

from phi.document.base import Document
from phi.document.reader.base import Reader
from phi.utils.log import logger

import httpx

try:
    from bs4 import BeautifulSoup  # noqa: F401
except ImportError:
    raise ImportError("The `bs4` package is not installed. Please install it via `pip install beautifulsoup4`.")


class WebsiteReader(Reader):
    """Reader for Websites"""

    max_depth: int = 3
    max_links: int = 10

    _visited: Set[str] = set()
    _urls_to_crawl: List[Tuple[str, int]] = []

    def delay(self, min_seconds: int = 1, max_seconds: int = 3) -> None:
        """
        Introduce a random delay between requests.

        :param min_seconds: Minimum number of seconds to delay. Default is 1.
        :param max_seconds: Maximum number of seconds to delay. Default is 3.
        """
        sleep_time = random.uniform(min_seconds, max_seconds)
        time.sleep(sleep_time)

    def _get_primary_domain(self, url: str) -> str:
        """
        Extract the primary domain from the given URL.

        :param url: The URL to extract the primary domain from.
        :return: The primary domain.
        """
        domain_parts = urlparse(url).netloc.split(".")
        # Return the primary domain (excluding subdomains)
        return ".".join(domain_parts[-2:])

    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """
        Extract the main content from a BeautifulSoup object.

        :param soup: The BeautifulSoup object to extract the main content from.
        :return: The main content, or an empty string if none is found.
        """
        # Try to find main content by specific tags or class names
        for tag in ["article", "main"]:
            element = soup.find(tag)
            if element:
                return element.get_text(strip=True, separator=" ")

        for class_name in ["content", "main-content", "post-content"]:
            element = soup.find(class_=class_name)
            if element:
                return element.get_text(strip=True, separator=" ")

        return ""

    def crawl(self, url: str, starting_depth: int = 1) -> Dict[str, str]:
        """
        Crawls a website and returns a dictionary of URLs and their corresponding content.

        Parameters:
        - url (str): The starting URL to begin the crawl.
        - starting_depth (int, optional): The starting depth level for the crawl. Defaults to 1.

        Returns:
        - Dict[str, str]: A dictionary where each key is a URL and the corresponding value is the main
          content extracted from that URL.

        Note:
        The function focuses on extracting the main content by prioritizing content inside common HTML tags
        like `<article>`, `<main>`, and `<div>` with class names such as "content", "main-content", etc.
        The crawler also respects the `max_depth` attribute of the WebsiteReader class, ensuring it does not
        crawl deeper than the specified depth.
        """
        num_links = 0
        crawler_result: Dict[str, str] = {}
        primary_domain = self._get_primary_domain(url)
        # Add the starting URL with its depth to the crawl queue
        self._urls_to_crawl.append((url, starting_depth))
        while self._urls_to_crawl:
            # Unpack the URL and depth from the front of the crawl queue
            current_url, current_depth = self._urls_to_crawl.pop(0)

            # Skip if
            # - the URL is already visited
            # - the URL does not end with the primary domain
            # - the current depth exceeds max_depth
            # - the number of extracted pages has reached max_links
            if (
                current_url in self._visited
                or not urlparse(current_url).netloc.endswith(primary_domain)
                or current_depth > self.max_depth
                or num_links >= self.max_links
            ):
                continue

            self._visited.add(current_url)
            self.delay()

            try:
                logger.debug(f"Crawling: {current_url}")
                response = httpx.get(current_url, timeout=10)
                soup = BeautifulSoup(response.content, "html.parser")

                # Extract the main content
                main_content = self._extract_main_content(soup)
                if main_content:
                    crawler_result[current_url] = main_content
                    num_links += 1

                # Add discovered URLs to the crawl queue with an incremented depth
                for link in soup.find_all("a", href=True):
                    full_url = urljoin(current_url, link["href"])
                    parsed_url = urlparse(full_url)
                    if parsed_url.netloc.endswith(primary_domain) and not any(
                        parsed_url.path.endswith(ext) for ext in [".pdf", ".jpg", ".png"]
                    ):
                        if full_url not in self._visited and (full_url, current_depth + 1) not in self._urls_to_crawl:
                            self._urls_to_crawl.append((full_url, current_depth + 1))

            except Exception as e:
                logger.debug(f"Failed to crawl: {current_url}: {e}")

        return crawler_result

    def read(self, url: str) -> List[Document]:
        """
        Reads a website and returns a list of documents.

        This function first converts the website into a dictionary of URLs and their corresponding content.
        It then iterates through the dictionary and returns the content as (optionally chunked) documents.

        :param url: The URL of the website to read.
        :return: A list of documents.
        """
        logger.debug(f"Reading: {url}")
        crawler_result = self.crawl(url)
        documents: List[Document] = []
        for crawled_url, crawled_content in crawler_result.items():
            if self.chunk:
                documents.extend(
                    self.chunk_document(
                        Document(
                            name=url, id=str(crawled_url), meta_data={"url": str(crawled_url)}, content=crawled_content
                        )
                    )
                )
            else:
                documents.append(
                    Document(
                        name=url,
                        id=str(crawled_url),
                        meta_data={"url": str(crawled_url)},
                        content=crawled_content,
                    )
                )
        return documents
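# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the library API). It assumes that `Reader`
# supports keyword construction for the `max_depth` / `max_links` fields and
# that chunking behaviour is controlled by the inherited `chunk` setting.
# The URL below is a placeholder, not a real endpoint.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    reader = WebsiteReader(max_depth=2, max_links=5)

    # crawl() returns {url: main_content} for pages on the same primary domain.
    pages = reader.crawl("https://docs.example.com")
    for page_url, content in pages.items():
        print(f"{page_url}: {len(content)} characters of main content")

    # read() wraps the crawled content into Document objects, chunked if enabled.
    documents = reader.read("https://docs.example.com")
    print(f"Extracted {len(documents)} documents")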