# self-corrective-rag/utils/build_rag.py
# Last updated by DrishtiSharma: "Update utils/build_rag.py" (commit 9d0f9a0, verified)
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import CharacterTextSplitter,TokenTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import os
class RAG:
    """Build, persist and query a Chroma vector store over a folder of PDFs.

    Configuration is read from environment variables when an instance is
    created:

    * ``SOURCE_DATA``  -- folder handed to ``PyPDFDirectoryLoader``.
    * ``EMBED_MODEL``  -- model name/path handed to ``HuggingFaceBgeEmbeddings``.
    * ``VECTOR_STORE`` -- directory handed to Chroma as ``persist_directory``.
    """

    def __init__(self) -> None:
        """Read configuration from the environment and load the embedding model.

        Raises:
            ValueError: If ``EMBED_MODEL`` is unset or empty.
        """
        self.pdf_folder_path = os.getenv('SOURCE_DATA')
        self.emb_model_path = os.getenv('EMBED_MODEL')
        self.vector_store_path = os.getenv('VECTOR_STORE')

        # Fail fast with an actionable message instead of passing ``None``
        # down into the embedding library.
        if not self.emb_model_path:
            raise ValueError(
                "EMBED_MODEL environment variable is not set; "
                "it must name the embedding model to load."
            )
        self.emb_model = self.get_embedding_model(self.emb_model_path)

    def load_docs(self, path: str) -> list:
        """Load every PDF found under *path*.

        Args:
            path: Folder to scan, passed to ``PyPDFDirectoryLoader``.

        Returns:
            Whatever ``PyPDFDirectoryLoader.load()`` returns (the loaded
            documents).
        """
        loader = PyPDFDirectoryLoader(path)
        return loader.load()

    def get_embedding_model(self, emb_model, device: str = 'cpu') -> "HuggingFaceBgeEmbeddings":
        """Create the BGE embedding model used for indexing and querying.

        Args:
            emb_model: Model name or path passed as ``model_name``.
            device: Device string forwarded in ``model_kwargs``; defaults to
                ``'cpu'``.

        Returns:
            A configured ``HuggingFaceBgeEmbeddings`` instance.
        """
        model_kwargs = {'device': device}
        # set True to compute cosine similarity
        encode_kwargs = {'normalize_embeddings': True}
        return HuggingFaceBgeEmbeddings(
            model_name=emb_model,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
        )

    def split_docs(self, docs, chunk_size: int = 500, chunk_overlap: int = 0) -> list:
        """Split loaded documents into token-based chunks.

        Args:
            docs: Documents to split, as produced by ``load_docs``.
            chunk_size: ``chunk_size`` given to ``TokenTextSplitter``.
            chunk_overlap: ``chunk_overlap`` given to ``TokenTextSplitter``.

        Returns:
            The result of ``TokenTextSplitter.split_documents(docs)``.
        """
        text_splitter = TokenTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        return text_splitter.split_documents(docs)

    def populate_vector_db(self) -> None:
        """Load the PDFs, chunk them, embed them and store them in Chroma.

        The loaded documents are kept on ``self.doc`` and the chunks on
        ``self.documents``.

        Raises:
            ValueError: If no PDF folder is configured (``SOURCE_DATA`` unset).
        """
        if not self.pdf_folder_path:
            raise ValueError(
                "SOURCE_DATA environment variable is not set; "
                "it must point to the folder of PDFs to index."
            )

        # load embeddings into Chroma - need to pass docs, embedding function
        # and path of the db
        self.doc = self.load_docs(self.pdf_folder_path)
        self.documents = self.split_docs(self.doc)
        db = Chroma.from_documents(
            self.documents,
            embedding=self.emb_model,
            persist_directory=self.vector_store_path,
        )
        # NOTE(review): newer Chroma releases may persist automatically --
        # confirm whether this explicit call is still required.
        db.persist()

    def load_vector_db(self) -> "Chroma":
        """Open the Chroma store at ``self.vector_store_path``.

        Returns:
            A ``Chroma`` instance using ``self.emb_model`` as its embedding
            function.
        """
        # to load back the embeddings from disk
        return Chroma(
            persist_directory=self.vector_store_path,
            embedding_function=self.emb_model,
        )

    def get_retriever(self):
        """Return a retriever over the stored vectors.

        Returns:
            The object produced by ``Chroma.as_retriever()`` on the store
            opened by ``load_vector_db``.
        """
        return self.load_vector_db().as_retriever()