File size: 2,953 Bytes
105b369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from typing import Iterator, List, Optional

from pydantic import model_validator

from phi.document import Document
from phi.document.reader.website import WebsiteReader
from phi.knowledge.base import AssistantKnowledge
from phi.utils.log import logger


class WebsiteKnowledgeBase(AssistantKnowledge):
    urls: List[str] = []
    reader: Optional[WebsiteReader] = None

    # WebsiteReader parameters
    max_depth: int = 3
    max_links: int = 10

    @model_validator(mode="after")  # type: ignore
    def set_reader(self) -> "WebsiteKnowledgeBase":
        if self.reader is None:
            self.reader = WebsiteReader(max_depth=self.max_depth, max_links=self.max_links)
        return self  # type: ignore

    @property
    def document_lists(self) -> Iterator[List[Document]]:
        """Iterate over urls and yield lists of documents.
        Each object yielded by the iterator is a list of documents.

        Returns:
            Iterator[List[Document]]: Iterator yielding list of documents
        """
        if self.reader is not None:
            for _url in self.urls:
                yield self.reader.read(url=_url)

    def load(self, recreate: bool = False, upsert: bool = True, skip_existing: bool = True) -> None:
        """Load the website contents to the vector db"""

        if self.vector_db is None:
            logger.warning("No vector db provided")
            return

        if self.reader is None:
            logger.warning("No reader provided")
            return

        if recreate:
            logger.debug("Deleting collection")
            self.vector_db.delete()

        logger.debug("Creating collection")
        self.vector_db.create()

        logger.info("Loading knowledge base")
        num_documents = 0

        # Given that the crawler needs to parse the URL before existence can be checked
        # We check if the website url exists in the vector db if recreate is False
        urls_to_read = self.urls.copy()
        if not recreate:
            for url in urls_to_read:
                logger.debug(f"Checking if {url} exists in the vector db")
                if self.vector_db.name_exists(name=url):
                    logger.debug(f"Skipping {url} as it exists in the vector db")
                    urls_to_read.remove(url)

        for url in urls_to_read:
            document_list = self.reader.read(url=url)
            # Filter out documents which already exist in the vector db
            if not recreate:
                document_list = [document for document in document_list if not self.vector_db.doc_exists(document)]

            self.vector_db.insert(documents=document_list)
            num_documents += len(document_list)
            logger.info(f"Loaded {num_documents} documents to knowledge base")

        if self.optimize_on is not None and num_documents > self.optimize_on:
            logger.debug("Optimizing Vector DB")
            self.vector_db.optimize()