Spaces:
Sleeping
Sleeping
import os
from typing import Any, List

import google.generativeai as genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from PyPDF2 import PdfReader
class GenerateFIASSDB:
    """Build a FAISS vector index from PDF files and save it to disk.

    Instantiating the class runs the whole pipeline: read the text of every
    PDF, split it into overlapping chunks, embed the chunks and write the
    resulting index to ``save_loc``.
    """

    def __init__(
        self,
        pdf_docs: List[str],
        save_loc: str,
        model_embeddings: str = "models/embedding-001",
    ) -> None:
        """Run the PDF -> chunks -> FAISS index pipeline.

        Args:
            pdf_docs: PDF inputs handed one by one to ``PdfReader``.
            save_loc: Location passed to ``FAISS.save_local``.
            model_embeddings: Model name given to
                ``GoogleGenerativeAIEmbeddings``.

        Raises:
            ValueError: If no text chunks could be produced from the PDFs.
        """
        # TODO: configure the Google Generative AI key from a config file.
        # NOTE(review): no key is configured here; presumably the embeddings
        # client picks it up from the environment -- confirm.
        self.save_loc = save_loc
        self.embedding = model_embeddings
        text = self.get_pdf_text(pdf_docs)
        text_chunks = self.get_text_chunks(text)
        self.get_vector_store(text_chunks)

    def get_pdf_text(self, pdf_docs: List[str]) -> str:
        """Return the concatenated text of every page of every PDF.

        Pages are joined in input order with no separator. A page whose
        extraction yields nothing contributes an empty string.
        """
        parts: List[str] = []
        for pdf in pdf_docs:
            # ``or ""`` keeps the join safe if an extractor returns None.
            parts.extend(
                page.extract_text() or "" for page in PdfReader(pdf).pages
            )
        # A single join avoids repeated string reallocation in the loop.
        return "".join(parts)

    def get_text_chunks(
        self,
        text: str,
        chunk_size: int = 10000,
        chunk_overlap: int = 1000,
    ) -> List[str]:
        """Split ``text`` into overlapping chunks.

        Args:
            text: The text to split.
            chunk_size: ``chunk_size`` passed to the splitter.
            chunk_overlap: ``chunk_overlap`` passed to the splitter.

        Returns:
            The chunks produced by ``RecursiveCharacterTextSplitter``.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        return text_splitter.split_text(text)

    def get_vector_store(self, text_chunks: List[str]) -> None:
        """Embed ``text_chunks`` and save the FAISS index to ``self.save_loc``.

        Raises:
            ValueError: If ``text_chunks`` is empty.
        """
        # Fail early with a clear message; an empty list would otherwise
        # surface as an opaque error from inside the index construction.
        if not text_chunks:
            raise ValueError(
                "No text chunks to index: the PDF documents yielded no text."
            )
        embeddings = GoogleGenerativeAIEmbeddings(model=self.embedding)
        vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
        vector_store.save_local(self.save_loc)
class DB_Retriever:
    """Load a FAISS index saved on disk and expose it as a retriever."""

    def __init__(
        self,
        db_loc: str,
        model_embeddings: str = "models/embedding-001",
    ) -> None:
        """Load the FAISS index stored at ``db_loc``.

        Args:
            db_loc: Location passed to ``FAISS.load_local``.
            model_embeddings: Model name given to
                ``GoogleGenerativeAIEmbeddings``.
        """
        self.db_loc = db_loc
        # Best-effort key configuration: a failure (for example a missing
        # GOOGLE_API_KEY variable) is printed and construction continues.
        try:
            genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
        except Exception as e:
            print(e)
        self.embeddings = GoogleGenerativeAIEmbeddings(model=model_embeddings)
        # SECURITY: allow_dangerous_deserialization=True opts in to
        # deserializing the stored index files. Only point ``db_loc`` at
        # index files you created yourself or otherwise trust.
        self.db = FAISS.load_local(
            self.db_loc,
            self.embeddings,
            allow_dangerous_deserialization=True,
        )

    def retrieve(self, query: str) -> Any:
        """Return a retriever built from the loaded FAISS index.

        Args:
            query: Not used by this method; kept so existing callers keep
                working. The caller runs the query through the returned
                retriever, e.g. ``retriever.invoke(query)``.

        Returns:
            The object returned by ``self.db.as_retriever()``. This is the
            retriever itself, not a list of matching documents.
        """
        retriever = self.db.as_retriever()
        return retriever