# NOTE: page residue captured with this file ("Spaces: Runtime error"); not part of the module.
from typing import List, Optional, Iterator, Dict, Any | |
from pydantic import BaseModel, ConfigDict | |
from phi.document import Document | |
from phi.document.reader.base import Reader | |
from phi.vectordb import VectorDb | |
from phi.utils.log import logger | |
class AssistantKnowledge(BaseModel):
    """Base class for LLM knowledge base.

    Holds an optional reader and an optional vector db, and provides helpers to
    load documents into the vector db and to search it. Subclasses provide the
    documents to load by overriding ``document_lists``.
    """

    # Reader to read the documents
    reader: Optional[Reader] = None
    # Vector db to store the knowledge base
    vector_db: Optional[VectorDb] = None
    # Number of relevant documents to return on search
    num_documents: int = 5
    # Number of documents to optimize the vector db on
    optimize_on: Optional[int] = 1000

    # Reader / VectorDb are declared as field types, so arbitrary types are allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    def document_lists(self) -> Iterator[List[Document]]:
        """Iterator that yields lists of documents in the knowledge base.

        Each object yielded by the iterator is a list of documents.

        Raises:
            NotImplementedError: Always, in this base class; subclasses must override.
        """
        raise NotImplementedError

    def search(self, query: str, num_documents: Optional[int] = None) -> List[Document]:
        """Returns relevant documents matching the query.

        Args:
            query (str): Query to search the vector db with.
            num_documents (Optional[int]): Maximum number of documents to return.
                A falsy value (None or 0) falls back to ``self.num_documents``.

        Returns:
            List[Document]: Documents returned by the vector db. An empty list is
                returned when no vector db is set or when the search raises.
        """
        try:
            if self.vector_db is None:
                logger.warning("No vector db provided")
                return []

            _num_documents = num_documents or self.num_documents
            logger.debug(f"Getting {_num_documents} relevant documents for query: {query}")
            return self.vector_db.search(query=query, limit=_num_documents)
        except Exception as e:
            # Best-effort: a failed search is logged and reported as "no results".
            logger.error(f"Error searching for documents: {e}")
            return []

    def load(self, recreate: bool = False, upsert: bool = False, skip_existing: bool = True) -> None:
        """Load the knowledge base to the vector db.

        Args:
            recreate (bool): If True, recreates the collection in the vector db. Defaults to False.
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db when inserting. Defaults to True.
        """
        if self.vector_db is None:
            logger.warning("No vector db provided")
            return

        if recreate:
            logger.info("Deleting collection")
            self.vector_db.delete()

        logger.info("Creating collection")
        self.vector_db.create()

        logger.info("Loading knowledge base")

        # `document_lists` is a plain method on this base class, but an override may
        # expose it as a property; accept both so iteration never hits a bound method.
        document_lists = self.document_lists
        if callable(document_lists):
            document_lists = document_lists()

        # Loop-invariant: upsert only when requested and supported by the vector db.
        use_upsert = upsert and self.vector_db.upsert_available()

        num_documents = 0
        for document_list in document_lists:
            documents_to_load = document_list
            if use_upsert:
                # Upsert documents
                self.vector_db.upsert(documents=documents_to_load)
            else:
                # Filter out documents which already exist in the vector db
                if skip_existing:
                    documents_to_load = [
                        document for document in document_list if not self.vector_db.doc_exists(document)
                    ]
                # Insert documents; skip the call when nothing is left to insert
                # (same guard as `load_documents`).
                if documents_to_load:
                    self.vector_db.insert(documents=documents_to_load)
            num_documents += len(documents_to_load)
            logger.info(f"Added {len(documents_to_load)} documents to knowledge base")

        # Optimize only once the total number of loaded documents exceeds the threshold.
        if self.optimize_on is not None and num_documents > self.optimize_on:
            logger.info("Optimizing Vector DB")
            self.vector_db.optimize()

    def load_documents(self, documents: List[Document], upsert: bool = False, skip_existing: bool = True) -> None:
        """Load documents to the knowledge base.

        Args:
            documents (List[Document]): List of documents to load
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db when inserting. Defaults to True.
        """
        logger.info("Loading knowledge base")
        if self.vector_db is None:
            logger.warning("No vector db provided")
            return

        logger.debug("Creating collection")
        self.vector_db.create()

        # Upsert documents if upsert is True and the vector db supports it
        if upsert and self.vector_db.upsert_available():
            self.vector_db.upsert(documents=documents)
            logger.info(f"Loaded {len(documents)} documents to knowledge base")
            return

        # Filter out documents which already exist in the vector db
        documents_to_load = (
            [document for document in documents if not self.vector_db.doc_exists(document)]
            if skip_existing
            else documents
        )

        # Insert documents
        if documents_to_load:
            self.vector_db.insert(documents=documents_to_load)
            logger.info(f"Loaded {len(documents_to_load)} documents to knowledge base")
        else:
            logger.info("No new documents to load")

    def load_document(self, document: Document, upsert: bool = False, skip_existing: bool = True) -> None:
        """Load a document to the knowledge base.

        Args:
            document (Document): Document to load
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db. Defaults to True.
        """
        self.load_documents(documents=[document], upsert=upsert, skip_existing=skip_existing)

    def load_dict(self, document: Dict[str, Any], upsert: bool = False, skip_existing: bool = True) -> None:
        """Load a dictionary representation of a document to the knowledge base.

        Args:
            document (Dict[str, Any]): Dictionary representation of a document
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db. Defaults to True.
        """
        self.load_documents(documents=[Document.from_dict(document)], upsert=upsert, skip_existing=skip_existing)

    def load_json(self, document: str, upsert: bool = False, skip_existing: bool = True) -> None:
        """Load a json representation of a document to the knowledge base.

        Args:
            document (str): Json representation of a document
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db. Defaults to True.
        """
        self.load_documents(documents=[Document.from_json(document)], upsert=upsert, skip_existing=skip_existing)

    def load_text(self, text: str, upsert: bool = False, skip_existing: bool = True) -> None:
        """Load a text to the knowledge base.

        Args:
            text (str): Text to load to the knowledge base
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db. Defaults to True.
        """
        self.load_documents(documents=[Document(content=text)], upsert=upsert, skip_existing=skip_existing)

    def exists(self) -> bool:
        """Returns True if the knowledge base exists.

        Returns False (after logging a warning) when no vector db is set;
        otherwise returns the result of the vector db's own ``exists()``.
        """
        if self.vector_db is None:
            logger.warning("No vector db provided")
            return False
        return self.vector_db.exists()

    def clear(self) -> bool:
        """Clear the knowledge base.

        Returns True (after logging a warning) when no vector db is set;
        otherwise returns the result of the vector db's own ``clear()``.
        """
        if self.vector_db is None:
            logger.warning("No vector db available")
            return True
        return self.vector_db.clear()