# AmmarFahmy
# adding all files
# 105b369
from typing import List, Optional, Iterator, Dict, Any
from pydantic import BaseModel, ConfigDict
from phi.document import Document
from phi.document.reader.base import Reader
from phi.vectordb import VectorDb
from phi.utils.log import logger
class AssistantKnowledge(BaseModel):
    """Base class for LLM knowledge base

    Pairs an optional document reader with an optional vector db and provides
    helpers to load documents into, search, check and clear that vector db.
    Subclasses are expected to override `document_lists`; the implementation
    in this class raises NotImplementedError.
    """

    # Reader to read the documents
    reader: Optional[Reader] = None
    # Vector db to store the knowledge base
    vector_db: Optional[VectorDb] = None
    # Number of relevant documents to return on search
    num_documents: int = 5
    # load() optimizes the vector db when more than this many documents were loaded.
    # Set to None to never optimize.
    optimize_on: Optional[int] = 1000

    model_config = ConfigDict(arbitrary_types_allowed=True)

    @property
    def document_lists(self) -> Iterator[List[Document]]:
        """Iterator that yields lists of documents in the knowledge base

        Each object yielded by the iterator is a list of documents.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def search(self, query: str, num_documents: Optional[int] = None) -> List[Document]:
        """Returns relevant documents matching the query

        Args:
            query (str): Query to search the vector db with.
            num_documents (Optional[int]): Maximum number of documents to return.
                Any falsy value (None or 0) falls back to `self.num_documents`.

        Returns:
            List[Document]: Result of the vector db search. An empty list is returned
                when no vector db is set or when the search raises an exception.
        """
        # Best-effort: any failure is logged and reported as "no results".
        try:
            if self.vector_db is None:
                logger.warning("No vector db provided")
                return []

            _num_documents = num_documents or self.num_documents
            logger.debug(f"Getting {_num_documents} relevant documents for query: {query}")
            return self.vector_db.search(query=query, limit=_num_documents)
        except Exception as e:
            logger.error(f"Error searching for documents: {e}")
            return []

    def load(self, recreate: bool = False, upsert: bool = False, skip_existing: bool = True) -> None:
        """Load the knowledge base to the vector db

        Iterates over `self.document_lists` and writes each list to the vector db,
        then optimizes the vector db if more than `self.optimize_on` documents were written.

        Args:
            recreate (bool): If True, recreates the collection in the vector db. Defaults to False.
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
                Falls back to insert when the vector db reports upsert as unavailable.
            skip_existing (bool): If True, skips documents which already exist in the vector db when inserting. Defaults to True.
        """
        if self.vector_db is None:
            logger.warning("No vector db provided")
            return

        if recreate:
            logger.info("Deleting collection")
            self.vector_db.delete()

        logger.info("Creating collection")
        self.vector_db.create()

        logger.info("Loading knowledge base")
        num_documents = 0
        for document_list in self.document_lists:
            documents_to_load = document_list
            # Upsert documents if upsert is True and vector db supports upsert
            if upsert and self.vector_db.upsert_available():
                self.vector_db.upsert(documents=documents_to_load)
            # Insert documents
            else:
                # Filter out documents which already exist in the vector db
                if skip_existing:
                    documents_to_load = [
                        document for document in document_list if not self.vector_db.doc_exists(document)
                    ]
                # Do not call insert with nothing to insert (same guard as load_documents)
                if len(documents_to_load) > 0:
                    self.vector_db.insert(documents=documents_to_load)
            num_documents += len(documents_to_load)
            logger.info(f"Added {len(documents_to_load)} documents to knowledge base")

        if self.optimize_on is not None and num_documents > self.optimize_on:
            logger.info("Optimizing Vector DB")
            self.vector_db.optimize()

    def load_documents(self, documents: List[Document], upsert: bool = False, skip_existing: bool = True) -> None:
        """Load documents to the knowledge base

        Args:
            documents (List[Document]): List of documents to load
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
                Falls back to insert when the vector db reports upsert as unavailable.
            skip_existing (bool): If True, skips documents which already exist in the vector db when inserting. Defaults to True.
        """
        logger.info("Loading knowledge base")
        if self.vector_db is None:
            logger.warning("No vector db provided")
            return

        logger.debug("Creating collection")
        self.vector_db.create()

        # Upsert documents if upsert is True
        if upsert and self.vector_db.upsert_available():
            self.vector_db.upsert(documents=documents)
            logger.info(f"Loaded {len(documents)} documents to knowledge base")
            return

        # Filter out documents which already exist in the vector db
        documents_to_load = (
            [document for document in documents if not self.vector_db.doc_exists(document)]
            if skip_existing
            else documents
        )

        # Insert documents
        if len(documents_to_load) > 0:
            self.vector_db.insert(documents=documents_to_load)
            logger.info(f"Loaded {len(documents_to_load)} documents to knowledge base")
        else:
            logger.info("No new documents to load")

    def load_document(self, document: Document, upsert: bool = False, skip_existing: bool = True) -> None:
        """Load a document to the knowledge base

        Wraps the document in a single-element list and delegates to `load_documents`.

        Args:
            document (Document): Document to load
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db. Defaults to True.
        """
        self.load_documents(documents=[document], upsert=upsert, skip_existing=skip_existing)

    def load_dict(self, document: Dict[str, Any], upsert: bool = False, skip_existing: bool = True) -> None:
        """Load a dictionary representation of a document to the knowledge base

        Builds the document with `Document.from_dict` and delegates to `load_documents`.

        Args:
            document (Dict[str, Any]): Dictionary representation of a document
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db. Defaults to True.
        """
        self.load_documents(documents=[Document.from_dict(document)], upsert=upsert, skip_existing=skip_existing)

    def load_json(self, document: str, upsert: bool = False, skip_existing: bool = True) -> None:
        """Load a json representation of a document to the knowledge base

        Builds the document with `Document.from_json` and delegates to `load_documents`.

        Args:
            document (str): Json representation of a document
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db. Defaults to True.
        """
        self.load_documents(documents=[Document.from_json(document)], upsert=upsert, skip_existing=skip_existing)

    def load_text(self, text: str, upsert: bool = False, skip_existing: bool = True) -> None:
        """Load a text to the knowledge base

        Wraps the text in `Document(content=text)` and delegates to `load_documents`.

        Args:
            text (str): Text to load to the knowledge base
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db. Defaults to True.
        """
        self.load_documents(documents=[Document(content=text)], upsert=upsert, skip_existing=skip_existing)

    def exists(self) -> bool:
        """Returns True if the knowledge base exists

        Returns:
            bool: False when no vector db is set, otherwise the result of `vector_db.exists()`.
        """
        if self.vector_db is None:
            logger.warning("No vector db provided")
            return False
        return self.vector_db.exists()

    def clear(self) -> bool:
        """Clear the knowledge base

        Returns:
            bool: True when no vector db is set (nothing to clear), otherwise the
                result of `vector_db.clear()`.
        """
        if self.vector_db is None:
            logger.warning("No vector db available")
            return True

        return self.vector_db.clear()