import os
from typing import List, Union

from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings


class PrepareVectorDB:
    """
    A class for preparing and saving a VectorDB using HuggingFace embeddings.

    This class facilitates the process of loading documents, chunking them,
    and creating a VectorDB with HuggingFace embeddings. It provides methods
    to prepare and save the VectorDB.

    Parameters:
        data_directory (str or List[str]): The directory containing the
            documents, or a list of PDF file paths.
        persist_directory (str): The directory to save the VectorDB.
        chunk_size (int): The size of the chunks for document processing.
        chunk_overlap (int): The overlap between chunks.
    """

    def __init__(
            self,
            data_directory: Union[str, List[str]],
            persist_directory: str,
            chunk_size: int,
            chunk_overlap: int
    ) -> None:
        """
        Initialize the PrepareVectorDB instance.

        Parameters:
            data_directory (str or List[str]): The directory containing the
                documents, or a list of PDF file paths.
            persist_directory (str): The directory to save the VectorDB.
            chunk_size (int): The size of the chunks for document processing.
            chunk_overlap (int): The overlap between chunks.
        """
        # Other options: CharacterTextSplitter, TokenTextSplitter, etc.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""]
        )
        self.data_directory = data_directory
        self.persist_directory = persist_directory
        self.embedding_function = HuggingFaceEmbeddings(
            model_name="NeuML/pubmedbert-base-embeddings",
            # cache_folder=os.getenv('SENTENCE_TRANSFORMERS_HOME')
        )

    def __load_all_documents(self) -> List:
        """
        Load all documents from the specified directory or list of file paths.

        Returns:
            List: A list of loaded documents (one entry per PDF page).
        """
        doc_counter = 0
        docs = []
        if isinstance(self.data_directory, list):
            # data_directory is a list of PDF file paths (e.g. user uploads).
            print("Loading the uploaded documents...")
            for doc_path in self.data_directory:
                docs.extend(PyPDFLoader(doc_path).load())
                doc_counter += 1
        else:
            # data_directory is a folder: load every file inside it.
            print("Loading documents manually...")
            for doc_name in os.listdir(self.data_directory):
                docs.extend(PyPDFLoader(
                    os.path.join(self.data_directory, doc_name)).load())
                doc_counter += 1
        print("Number of loaded documents:", doc_counter)
        print("Number of pages:", len(docs), "\n\n")
        return docs

    def __chunk_documents(self, docs: List) -> List:
        """
        Chunk the loaded documents using the specified text splitter.

        Parameters:
            docs (List): The list of loaded documents.

        Returns:
            List: A list of chunked documents.
        """
        print("Chunking documents...")
        chunked_documents = self.text_splitter.split_documents(docs)
        print("Number of chunks:", len(chunked_documents), "\n\n")
        return chunked_documents

    def prepare_and_save_vectordb(self) -> Chroma:
        """
        Load, chunk, and create a VectorDB with HuggingFace embeddings,
        and save it to the persist directory.

        Returns:
            Chroma: The created VectorDB.
        """
        docs = self.__load_all_documents()
        chunked_documents = self.__chunk_documents(docs)
        print("Preparing vectordb...")
        vectordb = Chroma.from_documents(
            documents=chunked_documents,
            embedding=self.embedding_function,
            persist_directory=self.persist_directory
        )
        print("VectorDB is created and saved.")
        print("Number of vectors in vectordb:",
              vectordb._collection.count(), "\n\n")
        return vectordb
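

# Minimal usage sketch. The paths and chunk settings below are assumptions
# for illustration, not part of the original module; adjust them to your
# own data layout.
if __name__ == "__main__":
    prepare = PrepareVectorDB(
        data_directory="data/docs",         # hypothetical folder of PDFs
        persist_directory="data/vectordb",  # hypothetical output path
        chunk_size=1500,
        chunk_overlap=150,
    )
    vectordb = prepare.prepare_and_save_vectordb()

    # Sanity check: retrieve the chunks most similar to a sample query.
    for doc in vectordb.similarity_search("sample query", k=3):
        print(doc.metadata, doc.page_content[:100])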