# NOTE: removed scraped-viewer artifacts that preceded the code
# (file-size banner, commit hash, and a spilled line-number gutter).
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import CharacterTextSplitter,TokenTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import os


class RAG:
    """Build and query a Chroma vector store over a folder of PDF files.

    Configuration is read from environment variables:
        SOURCE_DATA  - directory containing the source PDFs
        EMBED_MODEL  - path/name of the HuggingFace BGE embedding model
        VECTOR_STORE - directory where the Chroma DB is persisted
    """

    def __init__(self) -> None:
        # NOTE(review): os.getenv returns None when a variable is unset, in
        # which case the loader/embedding calls below will fail later with an
        # opaque error — confirm the environment is always populated.
        self.pdf_folder_path = os.getenv('SOURCE_DATA')
        self.emb_model_path = os.getenv('EMBED_MODEL')
        self.emb_model = self.get_embedding_model(self.emb_model_path)
        self.vector_store_path = os.getenv('VECTOR_STORE')

    def load_docs(self, path: str) -> list:
        """Load every PDF under *path* and return the loaded documents.

        Fixed return annotation: ``loader.load()`` returns a list of
        Documents, not the ``PyPDFDirectoryLoader`` instance.
        """
        loader = PyPDFDirectoryLoader(path)
        docs = loader.load()
        return docs

    def get_embedding_model(self, emb_model: str) -> HuggingFaceBgeEmbeddings:
        """Build a CPU-backed BGE embedding model for *emb_model*."""
        model_kwargs = {'device': 'cpu'}
        # normalize_embeddings=True so inner product equals cosine similarity
        encode_kwargs = {'normalize_embeddings': True}
        embeddings_model = HuggingFaceBgeEmbeddings(
            model_name=emb_model,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
        )
        return embeddings_model

    def split_docs(self, docs) -> list:
        """Split *docs* into 500-token chunks with no overlap.

        Fixed return annotation: ``split_documents()`` returns a list of
        chunked Documents, not the ``TokenTextSplitter`` instance.
        """
        text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=0)
        documents = text_splitter.split_documents(docs)
        return documents

    def populate_vector_db(self) -> None:
        """Load, split, embed, and persist all source PDFs into Chroma."""
        self.doc = self.load_docs(self.pdf_folder_path)
        self.documents = self.split_docs(self.doc)

        db = Chroma.from_documents(
            self.documents,
            embedding=self.emb_model,
            persist_directory=self.vector_store_path,
        )
        # Explicit flush to disk; newer langchain-chroma versions persist
        # automatically, but this is harmless with langchain_community.Chroma.
        db.persist()

    def load_vector_db(self) -> Chroma:
        """Re-open the persisted Chroma store with the same embedding model."""
        db = Chroma(
            persist_directory=self.vector_store_path,
            embedding_function=self.emb_model,
        )
        return db

    def get_retriever(self):
        """Return a retriever view over the persisted vector store.

        Dropped the wrong ``-> Chroma`` annotation: ``as_retriever()``
        returns a VectorStoreRetriever, not a Chroma instance.
        """
        return self.load_vector_db().as_retriever()