DrishtiSharma committed on
Commit
1a980de
·
verified ·
1 Parent(s): 8d048bf

Create build_rag.py

Browse files
Files changed (1) hide show
  1. utils/build_rag.py +55 -0
utils/build_rag.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.vectorstores import Chroma
2
+ from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
3
+ from langchain.text_splitter import CharacterTextSplitter,TokenTextSplitter
4
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
5
+ from dotenv import load_dotenv
6
+ import os
7
+
8
+ load_dotenv()
9
+
10
class RAG:
    """Build and query a Chroma vector store from a folder of PDF files.

    Configuration is read from environment variables when an instance is
    created:

    * ``SOURCE_DATA``  - folder passed to ``PyPDFDirectoryLoader``.
    * ``EMBED_MODEL``  - model name/path passed to ``HuggingFaceBgeEmbeddings``.
    * ``VECTOR_STORE`` - directory passed to Chroma as ``persist_directory``.

    ``os.getenv`` yields ``None`` for a variable that is not set, so all
    three should be defined before constructing this class.
    """

    def __init__(self) -> None:
        """Read the configuration and build the embedding model."""
        self.pdf_folder_path = os.getenv('SOURCE_DATA')
        self.emb_model_path = os.getenv('EMBED_MODEL')
        # The embedding model is created eagerly, so constructing a RAG
        # instance already instantiates HuggingFaceBgeEmbeddings.
        self.emb_model = self.get_embedding_model(self.emb_model_path)
        self.vector_store_path = os.getenv('VECTOR_STORE')

    def load_docs(self, path: str) -> list:
        """Load the PDFs found under *path*.

        Args:
            path: Folder handed to ``PyPDFDirectoryLoader``.

        Returns:
            Whatever ``PyPDFDirectoryLoader.load()`` produces (the loaded
            documents), not the loader itself.
        """
        loader = PyPDFDirectoryLoader(path)
        return loader.load()

    def get_embedding_model(self, emb_model, device: str = 'cpu') -> HuggingFaceBgeEmbeddings:
        """Create the embedding model used for indexing and querying.

        Args:
            emb_model: Model name or path forwarded as ``model_name``.
            device: Device forwarded in ``model_kwargs``; defaults to
                ``'cpu'``, the value that used to be hard-coded.

        Returns:
            A configured ``HuggingFaceBgeEmbeddings`` instance.
        """
        model_kwargs = {'device': device}
        # set True to compute cosine similarity
        encode_kwargs = {'normalize_embeddings': True}
        return HuggingFaceBgeEmbeddings(
            model_name=emb_model,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
        )

    def split_docs(self, docs, chunk_size: int = 500, chunk_overlap: int = 0) -> list:
        """Split *docs* into token-based chunks.

        Args:
            docs: Documents to split (e.g. the result of ``load_docs``).
            chunk_size: Forwarded to ``TokenTextSplitter``; defaults to the
                previously hard-coded 500.
            chunk_overlap: Forwarded to ``TokenTextSplitter``; defaults to
                the previously hard-coded 0.

        Returns:
            The chunks produced by ``TokenTextSplitter.split_documents``,
            not the splitter itself.
        """
        text_splitter = TokenTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        return text_splitter.split_documents(docs)

    def populate_vector_db(self) -> None:
        """Load, split and embed the source PDFs into the Chroma store.

        The loaded documents and their chunks are kept on the instance as
        ``self.doc`` and ``self.documents``.
        """
        # Chroma needs the documents, the embedding function and the
        # location of the database.
        self.doc = self.load_docs(self.pdf_folder_path)
        self.documents = self.split_docs(self.doc)

        db = Chroma.from_documents(
            self.documents,
            embedding=self.emb_model,
            persist_directory=self.vector_store_path,
        )

        # NOTE(review): newer Chroma releases appear to persist automatically
        # and deprecate this call - confirm against the installed version
        # before removing it.
        db.persist()

    def load_vector_db(self) -> Chroma:
        """Open the Chroma store at ``self.vector_store_path``.

        Returns:
            A ``Chroma`` instance that uses ``self.emb_model`` as its
            embedding function.
        """
        return Chroma(
            persist_directory=self.vector_store_path,
            embedding_function=self.emb_model,
        )

    def get_retriever(self):
        """Return a retriever over the stored vectors.

        Returns:
            The object produced by ``Chroma.as_retriever()`` with its
            default arguments (a retriever, not the ``Chroma`` store).
        """
        return self.load_vector_db().as_retriever()