Spaces:
Runtime error
Runtime error
from typing import Iterator, List, Optional | |
from pydantic import model_validator | |
from phi.document import Document | |
from phi.document.reader.website import WebsiteReader | |
from phi.knowledge.base import AssistantKnowledge | |
from phi.utils.log import logger | |
class WebsiteKnowledgeBase(AssistantKnowledge):
    """Knowledge base whose documents are read from a list of website URLs.

    Each URL in ``urls`` is handed to a ``WebsiteReader``; the documents it
    returns are inserted into ``vector_db``.  ``vector_db`` and
    ``optimize_on`` are not declared here -- presumably they are provided by
    ``AssistantKnowledge`` (confirm against the base class).
    """

    # URLs handed to the reader, one ``read`` call per URL.
    urls: List[str] = []
    # Reader used to fetch documents; built by ``set_reader`` when left as None.
    reader: Optional[WebsiteReader] = None

    # WebsiteReader parameters (only used when the reader is built here)
    max_depth: int = 3
    max_links: int = 10

    # The decorator is required: without it the default reader is never
    # created and ``load`` always stops at "No reader provided".
    @model_validator(mode="after")  # type: ignore
    def set_reader(self) -> "WebsiteKnowledgeBase":
        """Create a default ``WebsiteReader`` if the caller did not supply one.

        Returns:
            WebsiteKnowledgeBase: This instance, as an "after" validator must.
        """
        if self.reader is None:
            self.reader = WebsiteReader(max_depth=self.max_depth, max_links=self.max_links)
        return self  # type: ignore

    def document_lists(self) -> Iterator[List[Document]]:
        """Iterate over urls and yield lists of documents.

        Each object yielded by the iterator is the list of documents the
        reader returned for one URL.  Nothing is yielded when no reader is
        set.

        Returns:
            Iterator[List[Document]]: Iterator yielding list of documents
        """
        if self.reader is None:
            return
        for _url in self.urls:
            yield self.reader.read(url=_url)

    def load(self, recreate: bool = False, upsert: bool = True, skip_existing: bool = True) -> None:
        """Load the website contents to the vector db.

        Args:
            recreate: When True, delete the collection first and skip every
                "already exists" check, so all URLs are read and inserted.
            upsert: Accepted for interface compatibility; not used by this
                implementation.
            skip_existing: Accepted for interface compatibility; not used by
                this implementation (existing entries are skipped whenever
                ``recreate`` is False).
        """
        if self.vector_db is None:
            logger.warning("No vector db provided")
            return

        if self.reader is None:
            logger.warning("No reader provided")
            return

        if recreate:
            logger.debug("Deleting collection")
            self.vector_db.delete()

        # Called on every load, not only after a delete -- assumes ``create``
        # is safe to call when the collection already exists (TODO confirm).
        logger.debug("Creating collection")
        self.vector_db.create()

        logger.info("Loading knowledge base")
        num_documents = 0

        # Given that the crawler needs to parse the URL before existence can be checked
        # We check if the website url exists in the vector db if recreate is False
        if recreate:
            urls_to_read = list(self.urls)
        else:
            # Build a new list rather than removing from the list being
            # iterated: removing in place shifts the remaining items and
            # makes the loop skip the URL that follows each removed one.
            urls_to_read = []
            for url in self.urls:
                logger.debug(f"Checking if {url} exists in the vector db")
                if self.vector_db.name_exists(name=url):
                    logger.debug(f"Skipping {url} as it exists in the vector db")
                    continue
                urls_to_read.append(url)

        for url in urls_to_read:
            document_list = self.reader.read(url=url)
            # Filter out documents which already exist in the vector db
            if not recreate:
                document_list = [document for document in document_list if not self.vector_db.doc_exists(document)]

            self.vector_db.insert(documents=document_list)
            num_documents += len(document_list)
            # Running total, logged after each URL.
            logger.info(f"Loaded {num_documents} documents to knowledge base")

        if self.optimize_on is not None and num_documents > self.optimize_on:
            logger.debug("Optimizing Vector DB")
            self.vector_db.optimize()