"""Build and query a local FAISS vector index from PDF documents.

``GenerateFIASSDB`` extracts text from PDFs, splits it into chunks, embeds
the chunks with Google Generative AI embeddings and saves a FAISS index to
disk.  ``DB_Retriever`` loads such an index and exposes it as a retriever.
"""

import os
from typing import Any, List

import google.generativeai as genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from PyPDF2 import PdfReader


class GenerateFIASSDB:
    """Create a FAISS index from a list of PDF files and save it locally.

    All the work happens in the constructor: the PDFs are read, the text is
    chunked, embedded, and the resulting index is written to ``save_loc``.
    """

    def __init__(
        self,
        pdf_docs: List[str],
        save_loc: str,
        model_embeddings: str = "models/embedding-001",
    ) -> None:
        """Build the index from ``pdf_docs`` and persist it to ``save_loc``.

        Args:
            pdf_docs: PDF inputs passed one by one to ``PdfReader``.
            save_loc: Location handed to ``FAISS.save_local``.
            model_embeddings: Embedding model name passed to
                ``GoogleGenerativeAIEmbeddings``.

        Raises:
            ValueError: If no text could be extracted from ``pdf_docs``.
        """
        # TODO: configure the Gen AI key from a config file.
        self.save_loc = save_loc
        self.embedding = model_embeddings
        text = self.get_pdf_text(pdf_docs)
        text_chunks = self.get_text_chunks(text)
        self.get_vector_store(text_chunks)

    def get_pdf_text(self, pdf_docs: List[str]) -> str:
        """Return the concatenated text of every page of every PDF.

        Args:
            pdf_docs: PDF inputs passed one by one to ``PdfReader``.

        Returns:
            The page texts joined in document/page order with no separator.
        """
        page_texts: List[str] = []
        for pdf in pdf_docs:
            reader = PdfReader(pdf)
            # ``or ""`` guards against a page yielding None instead of a
            # string, which would otherwise break the concatenation.
            page_texts.extend(page.extract_text() or "" for page in reader.pages)
        return "".join(page_texts)

    def get_text_chunks(
        self,
        text: str,
        chunk_size: int = 10000,
        chunk_overlap: int = 1000,
    ) -> List[str]:
        """Split ``text`` into overlapping chunks.

        Args:
            text: The text to split.
            chunk_size: ``chunk_size`` given to the splitter.
            chunk_overlap: ``chunk_overlap`` given to the splitter.

        Returns:
            The chunks produced by ``RecursiveCharacterTextSplitter``.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        return text_splitter.split_text(text)

    def get_vector_store(self, text_chunks: List[str]) -> None:
        """Embed ``text_chunks`` and save the FAISS index to ``self.save_loc``.

        Args:
            text_chunks: Text chunks to embed and index.

        Raises:
            ValueError: If ``text_chunks`` is empty; an index cannot be built
                from nothing, so fail early with a clear message.
        """
        if not text_chunks:
            raise ValueError(
                "No text chunks to index; the PDFs produced no extractable text."
            )
        embeddings = GoogleGenerativeAIEmbeddings(model=self.embedding)
        vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
        vector_store.save_local(self.save_loc)


class DB_Retriever:
    """Load a saved FAISS index and expose it as a retriever."""

    def __init__(
        self,
        db_loc: str,
        model_embeddings: str = "models/embedding-001",
    ) -> None:
        """Load the FAISS index stored at ``db_loc``.

        Args:
            db_loc: Location handed to ``FAISS.load_local``.
            model_embeddings: Embedding model name passed to
                ``GoogleGenerativeAIEmbeddings``.
        """
        self.db_loc = db_loc
        # Best-effort configuration: a missing GOOGLE_API_KEY (or any other
        # configure failure) is reported but does not abort construction.
        try:
            genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
        except Exception as e:
            print(e)
        self.embeddings = GoogleGenerativeAIEmbeddings(model=model_embeddings)
        # SECURITY: allow_dangerous_deserialization=True lets load_local
        # unpickle data from disk.  Only point db_loc at indexes you created
        # or otherwise trust.
        self.db = FAISS.load_local(
            self.db_loc,
            self.embeddings,
            allow_dangerous_deserialization=True,
        )

    def retrieve(self, query: str) -> Any:
        """Return a retriever over the loaded index.

        Note that ``query`` is accepted for interface compatibility but is
        not used: the caller receives the retriever object itself (the result
        of ``self.db.as_retriever()``) and is expected to run the query on it.

        Args:
            query: Currently unused.

        Returns:
            The retriever created by ``self.db.as_retriever()``.
        """
        return self.db.as_retriever()