Spaces:
Runtime error
Runtime error
File size: 7,719 Bytes
105b369 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
from typing import List, Optional, Iterator, Dict, Any
from pydantic import BaseModel, ConfigDict
from phi.document import Document
from phi.document.reader.base import Reader
from phi.vectordb import VectorDb
from phi.utils.log import logger
class AssistantKnowledge(BaseModel):
    """Base class for LLM knowledge base.

    Holds an optional reader and an optional vector db, and provides helpers
    to load documents into the vector db, search it, check whether it exists
    and clear it. The ``document_lists`` property raises ``NotImplementedError``
    here, so ``load()`` only works on subclasses that override it.
    """

    # Reader to read the documents
    reader: Optional[Reader] = None
    # Vector db to store the knowledge base
    vector_db: Optional[VectorDb] = None
    # Number of relevant documents to return on search
    num_documents: int = 5
    # Number of documents to optimize the vector db on
    optimize_on: Optional[int] = 1000

    model_config = ConfigDict(arbitrary_types_allowed=True)

    @property
    def document_lists(self) -> Iterator[List[Document]]:
        """Iterator that yields lists of documents in the knowledge base.

        Each object yielded by the iterator is a list of documents.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def search(self, query: str, num_documents: Optional[int] = None) -> List[Document]:
        """Return relevant documents matching the query.

        Args:
            query (str): Query passed to the vector db search.
            num_documents (Optional[int]): Maximum number of documents to request.
                A falsy value (None or 0) falls back to ``self.num_documents``.

        Returns:
            List[Document]: Whatever the vector db search returns, or an empty list
            when no vector db is set or when any exception is raised while searching.
        """
        try:
            if self.vector_db is None:
                logger.warning("No vector db provided")
                return []

            _num_documents = num_documents or self.num_documents
            logger.debug(f"Getting {_num_documents} relevant documents for query: {query}")
            return self.vector_db.search(query=query, limit=_num_documents)
        except Exception as e:
            # Best-effort: a failed search is logged and reported as "no results".
            logger.error(f"Error searching for documents: {e}")
            return []

    def load(self, recreate: bool = False, upsert: bool = False, skip_existing: bool = True) -> None:
        """Load the knowledge base to the vector db.

        Iterates over ``self.document_lists`` and upserts or inserts each list.
        After all lists are processed, the vector db is optimized if the number of
        loaded documents exceeds ``self.optimize_on``.

        Args:
            recreate (bool): If True, recreates the collection in the vector db. Defaults to False.
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db
                when inserting. Defaults to True.
        """
        if self.vector_db is None:
            logger.warning("No vector db provided")
            return

        if recreate:
            logger.info("Deleting collection")
            self.vector_db.delete()

        logger.info("Creating collection")
        self.vector_db.create()

        logger.info("Loading knowledge base")
        num_documents = 0
        for document_list in self.document_lists:
            documents_to_load = document_list
            # Upsert documents if upsert is True and vector db supports upsert
            if upsert and self.vector_db.upsert_available():
                self.vector_db.upsert(documents=documents_to_load)
            # Insert documents
            else:
                # Filter out documents which already exist in the vector db
                if skip_existing:
                    documents_to_load = [
                        document for document in document_list if not self.vector_db.doc_exists(document)
                    ]
                # Only call insert when there is something to insert, same as load_documents()
                if documents_to_load:
                    self.vector_db.insert(documents=documents_to_load)
            num_documents += len(documents_to_load)
            logger.info(f"Added {len(documents_to_load)} documents to knowledge base")

        if self.optimize_on is not None and num_documents > self.optimize_on:
            logger.info("Optimizing Vector DB")
            self.vector_db.optimize()

    def load_documents(self, documents: List[Document], upsert: bool = False, skip_existing: bool = True) -> None:
        """Load documents to the knowledge base.

        Calls ``create()`` on the vector db, then either upserts all documents or
        inserts the ones that remain after the optional ``skip_existing`` filter.

        Args:
            documents (List[Document]): List of documents to load
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db
                when inserting. Defaults to True.
        """
        logger.info("Loading knowledge base")
        if self.vector_db is None:
            logger.warning("No vector db provided")
            return

        logger.debug("Creating collection")
        self.vector_db.create()

        # Upsert documents if upsert is True
        if upsert and self.vector_db.upsert_available():
            self.vector_db.upsert(documents=documents)
            logger.info(f"Loaded {len(documents)} documents to knowledge base")
            return

        # Filter out documents which already exist in the vector db
        documents_to_load = (
            [document for document in documents if not self.vector_db.doc_exists(document)]
            if skip_existing
            else documents
        )

        # Insert documents
        if len(documents_to_load) > 0:
            self.vector_db.insert(documents=documents_to_load)
            logger.info(f"Loaded {len(documents_to_load)} documents to knowledge base")
        else:
            logger.info("No new documents to load")

    def load_document(self, document: Document, upsert: bool = False, skip_existing: bool = True) -> None:
        """Load a document to the knowledge base.

        Wraps the document in a single-element list and delegates to ``load_documents``.

        Args:
            document (Document): Document to load
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db.
                Defaults to True.
        """
        self.load_documents(documents=[document], upsert=upsert, skip_existing=skip_existing)

    def load_dict(self, document: Dict[str, Any], upsert: bool = False, skip_existing: bool = True) -> None:
        """Load a dictionary representation of a document to the knowledge base.

        The dictionary is converted with ``Document.from_dict`` and passed to ``load_documents``.

        Args:
            document (Dict[str, Any]): Dictionary representation of a document
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db.
                Defaults to True.
        """
        self.load_documents(documents=[Document.from_dict(document)], upsert=upsert, skip_existing=skip_existing)

    def load_json(self, document: str, upsert: bool = False, skip_existing: bool = True) -> None:
        """Load a json representation of a document to the knowledge base.

        The string is converted with ``Document.from_json`` and passed to ``load_documents``.

        Args:
            document (str): Json representation of a document
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db.
                Defaults to True.
        """
        self.load_documents(documents=[Document.from_json(document)], upsert=upsert, skip_existing=skip_existing)

    def load_text(self, text: str, upsert: bool = False, skip_existing: bool = True) -> None:
        """Load a text to the knowledge base.

        The text becomes the ``content`` of a new ``Document`` that is passed to ``load_documents``.

        Args:
            text (str): Text to load to the knowledge base
            upsert (bool): If True, upserts documents to the vector db. Defaults to False.
            skip_existing (bool): If True, skips documents which already exist in the vector db.
                Defaults to True.
        """
        self.load_documents(documents=[Document(content=text)], upsert=upsert, skip_existing=skip_existing)

    def exists(self) -> bool:
        """Return True if the knowledge base exists.

        Returns:
            bool: False (after logging a warning) when no vector db is set, otherwise
            the result of ``vector_db.exists()``.
        """
        if self.vector_db is None:
            logger.warning("No vector db provided")
            return False
        return self.vector_db.exists()

    def clear(self) -> bool:
        """Clear the knowledge base.

        Returns:
            bool: True (after logging a warning) when no vector db is set, otherwise
            the result of ``vector_db.clear()``.
        """
        if self.vector_db is None:
            logger.warning("No vector db available")
            return True
        return self.vector_db.clear()
|