# Page-status header captured with this listing ("Spaces: Sleeping"); not part of the program.
import os

from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Chroma
class RAG:
    """Build and query a Chroma vector store from a directory of PDFs.

    Configuration is read from environment variables at construction time:

    * ``SOURCE_DATA``  -- directory handed to ``PyPDFDirectoryLoader``.
    * ``EMBED_MODEL``  -- model name handed to ``HuggingFaceBgeEmbeddings``.
    * ``VECTOR_STORE`` -- ``persist_directory`` handed to ``Chroma``.

    ``os.getenv`` returns ``None`` for an unset variable and no validation
    is done here, so an unset variable is passed on to the library as
    ``None``.
    """

    def __init__(self) -> None:
        """Read the configuration and instantiate the embedding model."""
        self.pdf_folder_path = os.getenv('SOURCE_DATA')
        self.emb_model_path = os.getenv('EMBED_MODEL')
        self.emb_model = self.get_embedding_model(self.emb_model_path)
        self.vector_store_path = os.getenv('VECTOR_STORE')

    def load_docs(self, path: str) -> list:
        """Load the PDFs found under *path*.

        Args:
            path: Directory passed to ``PyPDFDirectoryLoader``.

        Returns:
            The list of documents produced by ``PyPDFDirectoryLoader.load()``.
        """
        return PyPDFDirectoryLoader(path).load()

    def get_embedding_model(
        self, emb_model, device: str = 'cpu'
    ) -> HuggingFaceBgeEmbeddings:
        """Create the BGE embedding model used for indexing and querying.

        Args:
            emb_model: Value forwarded as ``model_name``.
            device: Device entry of ``model_kwargs``; defaults to ``'cpu'``,
                the value that was previously hard-coded.

        Returns:
            A configured ``HuggingFaceBgeEmbeddings`` instance.
        """
        return HuggingFaceBgeEmbeddings(
            model_name=emb_model,
            model_kwargs={'device': device},
            # Normalized embeddings: set True to compute cosine similarity.
            encode_kwargs={'normalize_embeddings': True},
        )

    def split_docs(self, docs, chunk_size: int = 500, chunk_overlap: int = 0) -> list:
        """Split *docs* into token-based chunks.

        Args:
            docs: Documents to split (e.g. the result of ``load_docs``).
            chunk_size: Forwarded to ``TokenTextSplitter``; defaults to the
                previously hard-coded 500.
            chunk_overlap: Forwarded to ``TokenTextSplitter``; defaults to
                the previously hard-coded 0.

        Returns:
            The chunked documents produced by ``split_documents``.
        """
        splitter = TokenTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        return splitter.split_documents(docs)

    def populate_vector_db(self) -> None:
        """Load, split and embed the source PDFs into the Chroma store.

        Side effects: sets ``self.doc`` (loaded documents) and
        ``self.documents`` (chunks), and writes the store under
        ``self.vector_store_path``.
        """
        self.doc = self.load_docs(self.pdf_folder_path)
        self.documents = self.split_docs(self.doc)
        # Chroma needs the chunks, the embedding function and the db path.
        db = Chroma.from_documents(
            self.documents,
            embedding=self.emb_model,
            persist_directory=self.vector_store_path,
        )
        # NOTE(review): newer Chroma releases reportedly persist
        # automatically and deprecate this call -- confirm against the
        # pinned version before removing it.
        db.persist()

    def load_vector_db(self) -> Chroma:
        """Open the Chroma store persisted under ``self.vector_store_path``.

        Returns:
            A ``Chroma`` instance bound to ``self.emb_model``.
        """
        return Chroma(
            persist_directory=self.vector_store_path,
            embedding_function=self.emb_model,
        )

    def get_retriever(self):
        """Return a retriever over the persisted store.

        The store is re-opened through ``load_vector_db`` on every call.

        Returns:
            The object returned by ``Chroma.as_retriever()`` (a retriever,
            not a ``Chroma`` instance).
        """
        return self.load_vector_db().as_retriever()