File size: 4,705 Bytes
e899e0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import os
from typing import List
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings



class PrepareVectorDB:
    """
    Prepare and persist a Chroma VectorDB from PDF documents.

    Loads PDF documents (either every file in a directory, or an explicit
    list of file paths such as documents uploaded during a chat session),
    splits them into overlapping chunks, embeds the chunks with a
    HuggingFace BGE model, and saves the resulting vector store to
    ``persist_directory``.

    Parameters:
        data_directory (str | list): Directory containing the documents,
            or a list of individual document paths.
        persist_directory (str): Directory in which to save the VectorDB.
        embedding_model_engine (str): Embedding engine identifier. Stored
            for reference but not currently used — the BGE model name is
            hard-coded in ``__init__``.
        chunk_size (int): Size of each chunk for document processing.
        chunk_overlap (int): Overlap between consecutive chunks.
    """

    def __init__(
            self,
            data_directory: str,
            persist_directory: str,
            embedding_model_engine: str,
            chunk_size: int,
            chunk_overlap: int) -> None:
        """
        Initialize the PrepareVectorDB instance.

        Parameters:
            data_directory (str | list): Directory containing the documents,
                or a list of individual document paths.
            persist_directory (str): Directory in which to save the VectorDB.
            embedding_model_engine (str): Embedding engine identifier
                (stored but unused; the BGE model below is hard-coded).
            chunk_size (int): Size of each chunk for document processing.
            chunk_overlap (int): Overlap between consecutive chunks.
        """
        # Kept for interface compatibility even though the embedding model
        # used below is hard-coded.
        self.embedding_model_engine = embedding_model_engine
        # NOTE(review): several of these separators look like regex patterns
        # (e.g. "\n#{1,6} "), but RecursiveCharacterTextSplitter treats
        # separators as literal strings unless is_separator_regex=True is
        # passed — confirm whether regex matching was intended.
        # Alternative splitters: MarkdownHeaderTextSplitter,
        # TokenTextSplitter, etc.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[
                "\n#{1,6} ",
                "```\n",
                "\n\\*\\*\\*+\n",
                "\n---+\n",
                "\n___+\n",
                "\n\n",
                "\n",
                " ",
                "",
                ]
        )
        self.data_directory = data_directory
        self.persist_directory = persist_directory
        self.embedding = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5",
                                                  model_kwargs={'device': 'cpu'},
                                                  encode_kwargs={'normalize_embeddings': True})

    def __load_all_documents(self) -> List:
        """
        Load all documents from the configured directory or path list.

        ``PyPDFLoader.load()`` returns one Document per PDF *page*, so the
        flattened list length is the page count, while the number of source
        files gives the document count.

        Returns:
            List: A flat list of loaded page-level documents.
        """
        if isinstance(self.data_directory, list):
            # Explicit list of file paths (e.g. uploaded during chat).
            print("Loading the uploaded documents...")
            doc_counter = len(self.data_directory)
            docs = [doc for doc_dir in self.data_directory
                    for doc in PyPDFLoader(doc_dir).load()]
        else:
            print("Loading documents manually...")
            document_list = os.listdir(self.data_directory)
            doc_counter = len(document_list)
            docs = [doc for doc_name in document_list
                    for doc in PyPDFLoader(os.path.join(
                        self.data_directory, doc_name)).load()]
        # Previously both prints reported len(docs); documents and pages
        # are now counted separately.
        print(f"Number of loaded documents: {doc_counter}")
        print(f"Number of pages: {len(docs)}\n\n")

        return docs

    def __chunk_documents(self, docs: List) -> List:
        """
        Chunk the loaded documents using the configured text splitter.

        Parameters:
            docs (List): The list of loaded documents.

        Returns:
            List: A list of chunked documents.
        """
        print("Chunking documents...")
        chunked_documents = self.text_splitter.split_documents(docs)
        print("Number of chunks:", len(chunked_documents), "\n\n")
        return chunked_documents

    def prepare_and_save_vectordb(self):
        """
        Load, chunk, and embed the documents, then save the VectorDB.

        Uses the HuggingFace BGE embeddings configured in ``__init__``
        (not OpenAI embeddings, as the original docstring claimed).

        Returns:
            Chroma: The created and persisted VectorDB.
        """
        docs = self.__load_all_documents()
        chunked_documents = self.__chunk_documents(docs)
        print("Preparing vectordb...")
        vectordb = Chroma.from_documents(
            documents=chunked_documents,
            embedding=self.embedding,
            persist_directory=self.persist_directory
        )
        print("Vectordb created and saved!")
        # _collection is a private Chroma attribute; used here only for a
        # diagnostic count.
        print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n")
        return vectordb