Spaces:
Sleeping
Sleeping
import os | |
from typing import List | |
from langchain_community.document_loaders import PyPDFLoader | |
from langchain_community.vectorstores import Chroma | |
from langchain_text_splitters import RecursiveCharacterTextSplitter | |
from langchain.embeddings import HuggingFaceBgeEmbeddings | |
class PrepareVectorDB: | |
""" | |
A class for preparing and saving a VectorDB using OpenAI embeddings. | |
Involves process of loading documents, chunking them, and creating a VectorDB | |
with OpenAI embeddings. contains methods to prepare & save the vecotordb. | |
Parameters: | |
data_directory (str): Directory or list of directories containing the documents. | |
persist_directory (str): Directory to save the VectorDB. | |
embedding_model_engine (str): The engine for OpenAI embeddings. | |
chunk_size (int): The size of the chunks for document processing. | |
chunk_overlap (int): The overlap between chunks. | |
""" | |
def __init__( | |
self, | |
data_directory: str, | |
persist_directory: str, | |
embedding_model_engine: str, | |
chunk_size: int, | |
chunk_overlap: int) -> None: | |
""" | |
Initializing the PrepareVectorDB instance. | |
Parameters: | |
data_directory (str): Directory or list of directories containing the documents. | |
persist_directory (str): Directory to save the VectorDB. | |
embedding_model_engine (str): The engine for OpenAI embeddings. | |
chunk_size (int): The size of the chunks for document processing. | |
chunk_overlap (int): The overlap between chunks. | |
""" | |
self.embedding_model_engine = embedding_model_engine | |
self.text_splitter = RecursiveCharacterTextSplitter( | |
chunk_size=chunk_size, | |
chunk_overlap=chunk_overlap, | |
separators=[ | |
"\n#{1,6} ", | |
"```\n", | |
"\n\\*\\*\\*+\n", | |
"\n---+\n", | |
"\n___+\n", | |
"\n\n", | |
"\n", | |
" ", | |
"", | |
] | |
) | |
"""choices: MarkdownHeaderTextSplitter,TokenTextSplitter, etc.""" | |
self.data_directory = data_directory | |
self.persist_directory = persist_directory | |
self.embedding = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5", | |
model_kwargs={'device': 'cpu'}, | |
encode_kwargs={'normalize_embeddings': True}) | |
def __load_all_documents(self) -> List: | |
""" | |
Load all documents from the specified directory or directories and | |
handles the documents obtained live during chat. | |
Returns: | |
List: A list of loaded documents. | |
""" | |
doc_counter = 0 | |
if isinstance(self.data_directory, list): | |
print("Loading the uploaded documents...") | |
docs = [doc for doc_dir in self.data_directory | |
for doc in PyPDFLoader(doc_dir).load()] | |
else: | |
print("Loading documents manually...") | |
document_list = os.listdir(self.data_directory) | |
docs = [doc for doc_name in document_list | |
for doc in PyPDFLoader(os.path.join( | |
self.data_directory, doc_name)).load()] | |
doc_counter = len(docs) | |
print(f"Number of loaded documents: {doc_counter}") | |
print(f"Number of pages: {len(docs)}\n\n") | |
return docs | |
def __chunk_documents(self, docs: List) -> List: | |
""" | |
Chunk the loaded documents using the specified text splitter. | |
Parameters: | |
docs (List): The list of loaded documents. | |
Returns: | |
List: A list of chunked documents. | |
""" | |
print("Chunking documents...") | |
chunked_documents = self.text_splitter.split_documents(docs) | |
print("Number of chunks:", len(chunked_documents), "\n\n") | |
return chunked_documents | |
def prepare_and_save_vectordb(self): | |
""" | |
Load, chunk, and create a VectorDB with OpenAI embeddings, and save it. | |
Returns: | |
Chroma: The created VectorDB. | |
""" | |
docs = self.__load_all_documents() | |
chunked_documents = self.__chunk_documents(docs) | |
print("Preparing vectordb...") | |
vectordb = Chroma.from_documents( | |
documents=chunked_documents, | |
embedding=self.embedding, | |
persist_directory=self.persist_directory | |
) | |
print("Vectordb created and saved!") | |
print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n") | |
return vectordb | |