# NDMO_english_assistant / vectorstore.py
# Fawaz0ibra's picture
# Upload 8 files
# c002818 verified
# vectorstore.py
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import FAISS
def load_or_build_vectorstore(local_file: str, index_folder: str, embeddings):
    """Load a saved FAISS index from disk, or build one from a PDF.

    If ``index_folder`` contains both files written by ``FAISS.save_local``
    under its default index name (``index.faiss`` and ``index.pkl``), the
    index is loaded from there. Otherwise the PDF is loaded, split into
    semantic chunks, embedded, and the resulting index is saved to
    ``index_folder`` before being returned.

    Args:
        local_file: Path of the PDF handed to ``PyPDFLoader``. Only read
            when a new index has to be built.
        index_folder: Directory the FAISS index is loaded from or saved to.
        embeddings: Embeddings object used for semantic chunking and for
            the vector store. NOTE(review): presumably this must be the same
            model that produced any index already on disk -- confirm.

    Returns:
        The loaded or newly built FAISS vector store.

    Raises:
        ValueError: If splitting the PDF produced no chunks to index.
    """
    # Checking only that the folder exists is not enough: an empty or
    # partially written folder (e.g. after an interrupted first run) would
    # make load_local fail. Require both files that save_local produces.
    saved_index_present = all(
        os.path.isfile(os.path.join(index_folder, file_name))
        for file_name in ("index.faiss", "index.pkl")
    )

    if saved_index_present:
        print("Loading existing FAISS index from disk...")
        # SECURITY: allow_dangerous_deserialization=True unpickles index.pkl,
        # which can execute arbitrary code. Only point index_folder at a
        # location this application wrote itself and that is not writable by
        # untrusted parties.
        return FAISS.load_local(
            index_folder,
            embeddings,
            allow_dangerous_deserialization=True,
        )

    print("Building a new FAISS index...")
    loader = PyPDFLoader(local_file)
    documents = loader.load()

    # Split at semantic boundaries: a breakpoint is placed where the
    # distance between neighbouring sentences exceeds the 90th percentile.
    text_splitter = SemanticChunker(
        embeddings=embeddings,
        breakpoint_threshold_type='percentile',
        breakpoint_threshold_amount=90,
    )
    chunked_docs = text_splitter.split_documents(documents)
    print(f"Document split into {len(chunked_docs)} chunks.")

    # An index cannot be built from nothing; fail with a clear message
    # instead of an obscure error from inside the FAISS wrapper.
    if not chunked_docs:
        raise ValueError(
            f"No text chunks could be extracted from {local_file!r}; "
            "cannot build a FAISS index."
        )

    vectorstore = FAISS.from_documents(chunked_docs, embeddings)
    vectorstore.save_local(index_folder)
    return vectorstore